616 files changed, 25151 insertions, 15170 deletions
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 86a2edbd8bd4..de37c391cf25 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -103,12 +103,15 @@ static std::optional<TypeSize> getObjectSize(const Value *V,
                                              const TargetLibraryInfo &TLI,
                                              bool NullIsValidLoc,
                                              bool RoundToAlign = false) {
-  uint64_t Size;
   ObjectSizeOpts Opts;
   Opts.RoundToAlign = RoundToAlign;
   Opts.NullIsUnknownSize = NullIsValidLoc;
-  if (getObjectSize(V, Size, DL, &TLI, Opts))
-    return TypeSize::getFixed(Size);
+  if (std::optional<TypeSize> Size = getBaseObjectSize(V, DL, &TLI, Opts)) {
+    // FIXME: Remove this check, only exists to preserve previous behavior.
+    if (Size->isScalable())
+      return std::nullopt;
+    return Size;
+  }
   return std::nullopt;
 }
 
@@ -227,9 +230,9 @@ EarliestEscapeAnalysis::getCapturesBefore(const Value *Object,
   auto Iter = EarliestEscapes.try_emplace(Object);
   if (Iter.second) {
     std::pair<Instruction *, CaptureComponents> EarliestCapture =
-        FindEarliestCapture(
-            Object, *const_cast<Function *>(DT.getRoot()->getParent()),
-            /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance);
+        FindEarliestCapture(Object, *DT.getRoot()->getParent(),
+                            /*ReturnCaptures=*/false, DT,
+                            CaptureComponents::Provenance);
     if (EarliestCapture.first)
       Inst2Obj[EarliestCapture.first].push_back(Object);
     Iter.first->second = EarliestCapture;
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 2148431c1acc..a136e8718435 100644..100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1652,6 +1652,13 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::amdgcn_perm:
   case Intrinsic::amdgcn_wave_reduce_umin:
   case Intrinsic::amdgcn_wave_reduce_umax:
+  case Intrinsic::amdgcn_wave_reduce_max:
+  case Intrinsic::amdgcn_wave_reduce_min:
+  case Intrinsic::amdgcn_wave_reduce_add:
+  case Intrinsic::amdgcn_wave_reduce_sub:
+  case Intrinsic::amdgcn_wave_reduce_and:
+  case Intrinsic::amdgcn_wave_reduce_or:
+  case Intrinsic::amdgcn_wave_reduce_xor:
   case Intrinsic::amdgcn_s_wqm:
   case Intrinsic::amdgcn_s_quadmask:
   case Intrinsic::amdgcn_s_bitreplicate:
@@ -3672,6 +3679,13 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
       return ConstantInt::get(Ty, C0->abs());
     case Intrinsic::amdgcn_wave_reduce_umin:
     case Intrinsic::amdgcn_wave_reduce_umax:
+    case Intrinsic::amdgcn_wave_reduce_max:
+    case Intrinsic::amdgcn_wave_reduce_min:
+    case Intrinsic::amdgcn_wave_reduce_add:
+    case Intrinsic::amdgcn_wave_reduce_sub:
+    case Intrinsic::amdgcn_wave_reduce_and:
+    case Intrinsic::amdgcn_wave_reduce_or:
+    case Intrinsic::amdgcn_wave_reduce_xor:
       return dyn_cast<Constant>(Operands[0]);
     }
 
@@ -4608,4 +4622,55 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
   return false;
 }
 
+Constant *llvm::getLosslessInvCast(Constant *C, Type *InvCastTo,
+                                   unsigned CastOp, const DataLayout &DL,
+                                   PreservedCastFlags *Flags) {
+  switch (CastOp) {
+  case Instruction::BitCast:
+    // Bitcast is always lossless.
+    return ConstantFoldCastOperand(Instruction::BitCast, C, InvCastTo, DL);
+  case Instruction::Trunc: {
+    auto *ZExtC = ConstantFoldCastOperand(Instruction::ZExt, C, InvCastTo, DL);
+    if (Flags) {
+      // Truncation back on ZExt value is always NUW.
+      Flags->NUW = true;
+      // NSW also holds iff C is non-negative, i.e. its zext and sext agree.
+      auto *SExtC =
+          ConstantFoldCastOperand(Instruction::SExt, C, InvCastTo, DL);
+      Flags->NSW = ZExtC == SExtC;
+    }
+    return ZExtC;
+  }
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    auto *InvC = ConstantExpr::getTrunc(C, InvCastTo);
+    auto *CastInvC = ConstantFoldCastOperand(CastOp, InvC, C->getType(), DL);
+    // Must satisfy CastOp(InvC) == C.
+    if (!CastInvC || CastInvC != C)
+      return nullptr;
+    if (Flags && CastOp == Instruction::ZExt) {
+      auto *SExtInvC =
+          ConstantFoldCastOperand(Instruction::SExt, InvC, C->getType(), DL);
+      // NNeg holds iff InvC is non-negative, i.e. its zext and sext agree.
+      Flags->NNeg = CastInvC == SExtInvC;
+    }
+    return InvC;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+Constant *llvm::getLosslessUnsignedTrunc(Constant *C, Type *DestTy,
+                                         const DataLayout &DL,
+                                         PreservedCastFlags *Flags) {
+  return getLosslessInvCast(C, DestTy, Instruction::ZExt, DL, Flags);
+}
+
+Constant *llvm::getLosslessSignedTrunc(Constant *C, Type *DestTy,
+                                       const DataLayout &DL,
+                                       PreservedCastFlags *Flags) {
+  return getLosslessInvCast(C, DestTy, Instruction::SExt, DL, Flags);
+}
+
 void TargetFolder::anchor() {}
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 3a7066602924..b78cc03e34db 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -786,7 +786,7 @@ StringRef dxil::getResourceNameFromBindingCall(CallInst *CI) {
     llvm_unreachable("unexpected handle creation intrinsic");
   case Intrinsic::dx_resource_handlefrombinding:
   case Intrinsic::dx_resource_handlefromimplicitbinding:
-    Op = CI->getArgOperand(5);
+    Op = CI->getArgOperand(4);
     break;
   }
 
@@ -1010,7 +1010,7 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
               cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
           int32_t Size =
               cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
-          Value *Name = CI->getArgOperand(5);
+          Value *Name = CI->getArgOperand(4);
 
           // negative size means unbounded resource array;
           // upper bound register overflow should be detected in Sema
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 762d9191aab1..4064b25d9d4e 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -182,7 +182,7 @@ void llvm::collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr,
   LLVM_DEBUG({
     dbgs() << "Strides:\n";
     for (const SCEV *S : Strides)
-      dbgs() << *S << "\n";
+      dbgs().indent(2) << *S << "\n";
   });
 
   for (const SCEV *S : Strides) {
@@ -193,7 +193,7 @@ void llvm::collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr,
   LLVM_DEBUG({
     dbgs() << "Terms:\n";
     for (const SCEV *T : Terms)
-      dbgs() << *T << "\n";
+      dbgs().indent(2) << *T << "\n";
   });
 
   SCEVCollectAddRecMultiplies MulCollector(Terms, SE);
@@ -294,7 +294,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE,
   LLVM_DEBUG({
     dbgs() << "Terms:\n";
     for (const SCEV *T : Terms)
-      dbgs() << *T << "\n";
+      dbgs().indent(2) << *T << "\n";
   });
 
   // Remove duplicates.
@@ -325,7 +325,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE,
   LLVM_DEBUG({
     dbgs() << "Terms after sorting:\n";
     for (const SCEV *T : NewTerms)
-      dbgs() << *T << "\n";
+      dbgs().indent(2) << *T << "\n";
   });
 
   if (NewTerms.empty() || !findArrayDimensionsRec(SE, NewTerms, Sizes)) {
@@ -339,7 +339,7 @@ void llvm::findArrayDimensions(ScalarEvolution &SE,
   LLVM_DEBUG({
     dbgs() << "Sizes:\n";
     for (const SCEV *S : Sizes)
-      dbgs() << *S << "\n";
+      dbgs().indent(2) << *S << "\n";
   });
 }
 
@@ -354,18 +354,24 @@ void llvm::computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr,
     if (!AR->isAffine())
       return;
 
+  LLVM_DEBUG(dbgs() << "\ncomputeAccessFunctions\n"
+                    << "Memory Access Function: " << *Expr << "\n");
+
   const SCEV *Res = Expr;
   int Last = Sizes.size() - 1;
+
   for (int i = Last; i >= 0; i--) {
+    const SCEV *Size = Sizes[i];
     const SCEV *Q, *R;
-    SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R);
+
+    SCEVDivision::divide(SE, Res, Size, &Q, &R);
 
     LLVM_DEBUG({
-      dbgs() << "Res: " << *Res << "\n";
-      dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
-      dbgs() << "Res divided by Sizes[i]:\n";
-      dbgs() << "Quotient: " << *Q << "\n";
-      dbgs() << "Remainder: " << *R << "\n";
+      dbgs() << "Computing 'MemAccFn / Sizes[" << i << "]':\n";
+      dbgs() << "  MemAccFn: " << *Res << "\n";
+      dbgs() << "  Sizes[" << i << "]: " << *Size << "\n";
+      dbgs() << "  Quotient (Leftover): " << *Q << "\n";
+      dbgs() << "  Remainder (Subscript Access Function): " << *R << "\n";
     });
 
     Res = Q;
@@ -397,7 +403,8 @@ void llvm::computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr,
   LLVM_DEBUG({
     dbgs() << "Subscripts:\n";
     for (const SCEV *S : Subscripts)
-      dbgs() << *S << "\n";
+      dbgs().indent(2) << *S << "\n";
+    dbgs() << "\n";
   });
 }
 
@@ -469,21 +476,6 @@ void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr,
 
   // Third step: compute the access functions for each subscript.
   computeAccessFunctions(SE, Expr, Subscripts, Sizes);
-
-  if (Subscripts.empty())
-    return;
-
-  LLVM_DEBUG({
-    dbgs() << "succeeded to delinearize " << *Expr << "\n";
-    dbgs() << "ArrayDecl[UnknownSize]";
-    for (const SCEV *S : Sizes)
-      dbgs() << "[" << *S << "]";
-
-    dbgs() << "\nArrayRef";
-    for (const SCEV *S : Subscripts)
-      dbgs() << "[" << *S << "]";
-    dbgs() << "\n";
-  });
 }
 
 static std::optional<APInt> tryIntoAPInt(const SCEV *S) {
@@ -671,6 +663,7 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
   assert(Subscripts.empty() && Sizes.empty() &&
          "Expected output lists to be empty on entry to this function.");
   assert(GEP && "getIndexExpressionsFromGEP called with a null GEP");
+  LLVM_DEBUG(dbgs() << "\nGEP to delinearize: " << *GEP << "\n");
   Type *Ty = nullptr;
   bool DroppedFirstDim = false;
   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
@@ -688,6 +681,8 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
 
     auto *ArrayTy = dyn_cast<ArrayType>(Ty);
     if (!ArrayTy) {
+      LLVM_DEBUG(dbgs() << "GEP delinearize failed: " << *Ty
+                        << " is not an array type.\n");
       Subscripts.clear();
       Sizes.clear();
       return false;
@@ -699,6 +694,13 @@ bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
 
     Ty = ArrayTy->getElementType();
   }
+  LLVM_DEBUG({
+    dbgs() << "Subscripts:\n";
+    for (const SCEV *S : Subscripts)
+      dbgs().indent(2) << *S << "\n";
+    dbgs() << "\n";
+  });
+
   return !Subscripts.empty();
 }
 
@@ -769,7 +771,6 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI,
 
     O << "\n";
     O << "Inst:" << Inst << "\n";
-    O << "In Loop with Header: " << L->getHeader()->getName() << "\n";
     O << "AccessFunction: " << *AccessFn << "\n";
 
     SmallVector<const SCEV *, 3> Subscripts, Sizes;
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index f33e04e804e3..da86a8d2cc9c 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3419,13 +3419,24 @@ bool DependenceInfo::tryDelinearizeFixedSize(
       size_t SSize = Subscripts.size();
       for (size_t I = 1; I < SSize; ++I) {
         const SCEV *S = Subscripts[I];
-        if (!isKnownNonNegative(S, Ptr))
+        if (!isKnownNonNegative(S, Ptr)) {
+          LLVM_DEBUG({
+            dbgs() << "Check failed: !isKnownNonNegative(S, Ptr)\n";
+            dbgs() << "  S: " << *S << "\n" << "  Ptr: " << *Ptr << "\n";
+          });
           return false;
+        }
         if (auto *SType = dyn_cast<IntegerType>(S->getType())) {
           const SCEV *Range = SE->getConstant(
               ConstantInt::get(SType, DimensionSizes[I - 1], false));
-          if (!isKnownLessThan(S, Range))
+          if (!isKnownLessThan(S, Range)) {
+            LLVM_DEBUG({
+              dbgs() << "Check failed: !isKnownLessThan(S, Range)\n";
+              dbgs() << "  S: " << *S << "\n"
+                     << "  Range: " << *Range << "\n";
+            });
             return false;
+          }
         }
       }
       return true;
@@ -3433,6 +3444,7 @@ bool DependenceInfo::tryDelinearizeFixedSize(
 
     if (!AllIndicesInRange(SrcSizes, SrcSubscripts, SrcPtr) ||
         !AllIndicesInRange(DstSizes, DstSubscripts, DstPtr)) {
+      LLVM_DEBUG(dbgs() << "Check failed: AllIndicesInRange.\n");
       SrcSubscripts.clear();
       DstSubscripts.clear();
       return false;
@@ -3500,17 +3512,27 @@ bool DependenceInfo::tryDelinearizeParametricSize(
   // to the dependency checks.
   if (!DisableDelinearizationChecks)
     for (size_t I = 1; I < Size; ++I) {
-      if (!isKnownNonNegative(SrcSubscripts[I], SrcPtr))
-        return false;
-
-      if (!isKnownLessThan(SrcSubscripts[I], Sizes[I - 1]))
-        return false;
-
-      if (!isKnownNonNegative(DstSubscripts[I], DstPtr))
-        return false;
+      bool SNN = isKnownNonNegative(SrcSubscripts[I], SrcPtr);
+      bool DNN = isKnownNonNegative(DstSubscripts[I], DstPtr);
+      bool SLT = isKnownLessThan(SrcSubscripts[I], Sizes[I - 1]);
+      bool DLT = isKnownLessThan(DstSubscripts[I], Sizes[I - 1]);
+      if (SNN && DNN && SLT && DLT)
+        continue;
 
-      if (!isKnownLessThan(DstSubscripts[I], Sizes[I - 1]))
-        return false;
+      LLVM_DEBUG({
+        dbgs() << "Delinearization checks failed: can't prove the following\n";
+        if (!SNN)
+          dbgs() << "  isKnownNonNegative(" << *SrcSubscripts[I] << ")\n";
+        if (!DNN)
+          dbgs() << "  isKnownNonNegative(" << *DstSubscripts[I] << ")\n";
+        if (!SLT)
+          dbgs() << "  isKnownLessThan(" << *SrcSubscripts[I] << ", "
+                 << *Sizes[I - 1] << ")\n";
+        if (!DLT)
+          dbgs() << "  isKnownLessThan(" << *DstSubscripts[I] << ", "
+                 << *Sizes[I - 1] << ")\n";
+      });
+      return false;
     }
 
   return true;
diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index 790e00e1b3b0..67e38ab8b35a 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -97,7 +97,8 @@ struct InlineEvent {
 /// Collect data we may use for training a model.
 class TrainingLogger final {
 public:
-  TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR);
+  TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR,
+                 const std::vector<TensorSpec> &FeatureMap);
 
   /// Log one inlining event.
   void logInlineEvent(const InlineEvent &Event,
@@ -106,6 +107,8 @@ public:
 private:
   StringRef LogFileName;
   const ModelUnderTrainingRunner *const MUTR;
+  const std::vector<TensorSpec> &FeatureMap;
+
   std::unique_ptr<Logger> L;
   BitVector Effects;
   /// Set these 2 clearly OOB, to make sure we set them later.
@@ -142,9 +145,10 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor {
 public:
   DevelopmentModeMLInlineAdvisor(
       Module &M, ModuleAnalysisManager &MAM,
-      std::unique_ptr<MLModelRunner> ModelRunner,
-      std::function<bool(CallBase &)> GetDefaultAdvice,
-      std::unique_ptr<TrainingLogger> Logger);
+      std::function<
+          std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)>
+          GetModelRunner,
+      std::function<bool(CallBase &)> GetDefaultAdvice);
 
   size_t getTotalSizeEstimate();
 
@@ -258,9 +262,13 @@ static const std::vector<TensorSpec> TrainingOnlyFeatures{
     TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}),
     TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})};
 
-static const std::vector<TensorSpec> getInputFeatures() {
+// Add TFFeedPrefix to the names and also add the "TrainingOnlyFeatures" which
+// the model runner needs to see present. We don't set them ourselves or
+// interact with them.
+static const std::vector<TensorSpec>
+convertInputFeatures(const std::vector<TensorSpec> &OriginalFeatures) {
   std::vector<TensorSpec> InputSpecs;
-  for (const auto &Feature : FeatureMap)
+  for (const auto &Feature : OriginalFeatures)
     InputSpecs.push_back(TensorSpec(TFFeedPrefix + Feature.name(), Feature));
   append_range(InputSpecs, TrainingOnlyFeatures);
   return InputSpecs;
@@ -269,8 +277,9 @@ static const std::vector<TensorSpec> getInputFeatures() {
 } // namespace
 
 TrainingLogger::TrainingLogger(StringRef LogFileName,
-                               const ModelUnderTrainingRunner *MUTR)
-    : LogFileName(LogFileName), MUTR(MUTR) {
+                               const ModelUnderTrainingRunner *MUTR,
+                               const std::vector<TensorSpec> &FeatureMap)
+    : LogFileName(LogFileName), MUTR(MUTR), FeatureMap(FeatureMap) {
   // The first output is the inlining decision.
   std::vector<TensorSpec> FT(FeatureMap.begin(), FeatureMap.end());
 
@@ -298,8 +307,7 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event,
                                     const MLModelRunner &ModelRunner) {
   L->startObservation();
   size_t CurrentFeature = 0;
-  size_t FeatureMapSize = FeatureMap.size();
-  for (; CurrentFeature < FeatureMapSize; ++CurrentFeature)
+  for (; CurrentFeature < FeatureMap.size(); ++CurrentFeature)
     L->logTensorValue(CurrentFeature,
                       reinterpret_cast<const char *>(
                           ModelRunner.getTensorUntyped(CurrentFeature)));
@@ -327,15 +335,19 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event,
 
 DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor(
     Module &M, ModuleAnalysisManager &MAM,
-    std::unique_ptr<MLModelRunner> ModelRunner,
-    std::function<bool(CallBase &)> GetDefaultAdvice,
-    std::unique_ptr<TrainingLogger> Logger)
-    : MLInlineAdvisor(M, MAM, std::move(ModelRunner), GetDefaultAdvice),
+    std::function<
+        std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)>
+        GetModelRunner,
+    std::function<bool(CallBase &)> GetDefaultAdvice)
+    : MLInlineAdvisor(M, MAM, GetModelRunner, GetDefaultAdvice),
       IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())),
-      Logger(std::move(Logger)),
-      InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0),
+      InitialNativeSize(!TrainingLog.empty() ? getTotalSizeEstimate() : 0),
       CurrentNativeSize(InitialNativeSize) {
-  // We cannot have the case of neither inference nor logging.
+  // Logger is created here, so the initializers above can't use isLogging().
+  if (!TrainingLog.empty())
+    Logger = std::make_unique<TrainingLogger>(
+        TrainingLog, dyn_cast<ModelUnderTrainingRunner>(ModelRunner.get()),
+        getFeatureMap());
-  assert(IsDoingInference || isLogging());
+  assert((IsDoingInference || isLogging()) && "neither inference nor logging");
 }
 
@@ -401,21 +413,22 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor(
     Module &M, ModuleAnalysisManager &MAM,
     std::function<bool(CallBase &)> GetDefaultAdvice) {
   auto &Ctx = M.getContext();
-  std::unique_ptr<MLModelRunner> Runner;
-  if (TFModelUnderTrainingPath.empty())
-    Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures()));
-  else
-    Runner = ModelUnderTrainingRunner::createAndEnsureValid(
-        Ctx, TFModelUnderTrainingPath, DecisionName, getInputFeatures(),
-        TFOutputSpecOverride);
-  if (!Runner)
-    return nullptr;
-  std::unique_ptr<TrainingLogger> Logger;
-  if (!TrainingLog.empty())
-    Logger = std::make_unique<TrainingLogger>(
-        TrainingLog, dyn_cast<ModelUnderTrainingRunner>(Runner.get()));
-
-  return std::make_unique<DevelopmentModeMLInlineAdvisor>(
-      M, MAM, std::move(Runner), GetDefaultAdvice, std::move(Logger));
+  auto RunnerFactory = [&](const std::vector<TensorSpec> &InputFeatures)
+      -> std::unique_ptr<MLModelRunner> {
+    std::unique_ptr<MLModelRunner> Runner;
+    const std::vector<TensorSpec> ConvertedFeatures =
+        convertInputFeatures(InputFeatures);
+    if (TFModelUnderTrainingPath.empty())
+      Runner.reset(new NoInferenceModelRunner(Ctx, ConvertedFeatures));
+    else
+      Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+          Ctx, TFModelUnderTrainingPath, DecisionName, ConvertedFeatures,
+          TFOutputSpecOverride);
+    if (!Runner)
+      return nullptr;
+    return Runner;
+  };
+  return std::make_unique<DevelopmentModeMLInlineAdvisor>(M, MAM, RunnerFactory,
+                                                          GetDefaultAdvice);
 }
 #endif // defined(LLVM_HAVE_TFLITE)
diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index 92c9e37dbb48..5d7ee1fe8eb1 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -8,8 +8,10 @@
 //
 // The HashRecognize analysis recognizes unoptimized polynomial hash functions
 // with operations over a Galois field of characteristic 2, also called binary
-// fields, or GF(2^n): this class of hash functions can be optimized using a
-// lookup-table-driven implementation, or with target-specific instructions.
+// fields, or GF(2^n). 2^n is termed the order of the Galois field. This class
+// of hash functions can be optimized using a lookup-table-driven
+// implementation, or with target-specific instructions.
+//
 // Examples:
 //
 //  1. Cyclic redundancy check (CRC), which is a polynomial division in GF(2).
@@ -24,12 +26,10 @@
 //
 //    c_m * x^m + c_(m-1) * x^(m-1) + ... + c_0 * x^0
 //
-// where each coefficient c is can take values in GF(2^n), where 2^n is termed
-// the order of the Galois field. For GF(2), each coefficient can take values
-// either 0 or 1, and the polynomial is simply represented by m+1 bits,
-// corresponding to the coefficients. The different variants of CRC are named by
-// degree of generating polynomial used: so CRC-32 would use a polynomial of
-// degree 32.
+// where each coefficient c can take values 0 or 1. The polynomial is simply
+// represented by m+1 bits, corresponding to the coefficients. The different
+// variants of CRC are named by degree of generating polynomial used: so CRC-32
+// would use a polynomial of degree 32.
 //
 // The reason algorithms on GF(2^n) can be optimized with a lookup-table is the
 // following: in such fields, polynomial addition and subtraction are identical
@@ -73,202 +73,31 @@ using namespace SCEVPatternMatch;
 
 #define DEBUG_TYPE "hash-recognize"
 
-// KnownBits for a PHI node. There are at most two PHI nodes, corresponding to
-// the Simple Recurrence and Conditional Recurrence. The IndVar PHI is not
-// relevant.
-using KnownPhiMap = SmallDenseMap<const PHINode *, KnownBits, 2>;
-
-// A pair of a PHI node along with its incoming value from within a loop.
-using PhiStepPair = std::pair<const PHINode *, const Instruction *>;
-
-/// A much simpler version of ValueTracking, in that it computes KnownBits of
-/// values, except that it computes the evolution of KnownBits in a loop with a
-/// given trip count, and predication is specialized for a significant-bit
-/// check.
-class ValueEvolution {
-  const unsigned TripCount;
-  const bool ByteOrderSwapped;
-  APInt GenPoly;
-  StringRef ErrStr;
-
-  // Compute the KnownBits of a BinaryOperator.
-  KnownBits computeBinOp(const BinaryOperator *I);
-
-  // Compute the KnownBits of an Instruction.
-  KnownBits computeInstr(const Instruction *I);
-
-  // Compute the KnownBits of a Value.
-  KnownBits compute(const Value *V);
-
-public:
-  // ValueEvolution is meant to be constructed with the TripCount of the loop,
-  // and a boolean indicating whether the polynomial algorithm is big-endian
-  // (for the significant-bit check).
-  ValueEvolution(unsigned TripCount, bool ByteOrderSwapped);
-
-  // Given a list of PHI nodes along with their incoming value from within the
-  // loop, computeEvolutions computes the KnownBits of each of the PHI nodes on
-  // the final iteration. Returns true on success and false on error.
-  bool computeEvolutions(ArrayRef<PhiStepPair> PhiEvolutions);
-
-  // In case ValueEvolution encounters an error, this is meant to be used for a
-  // precise error message.
-  StringRef getError() const { return ErrStr; }
-
-  // A set of Instructions visited by ValueEvolution. The only unvisited
-  // instructions will be ones not on the use-def chain of the PHIs' evolutions.
+/// Checks if there's a stray instruction in the loop \p L outside of the
+/// use-def chains from \p Roots, or if we escape the loop during the use-def
+/// walk.
+static bool containsUnreachable(const Loop &L,
+                                ArrayRef<const Instruction *> Roots) {
   SmallPtrSet<const Instruction *, 16> Visited;
+  BasicBlock *Latch = L.getLoopLatch();
 
-  // The computed KnownBits for each PHI node, which is populated after
-  // computeEvolutions is called.
-  KnownPhiMap KnownPhis;
-};
-
-ValueEvolution::ValueEvolution(unsigned TripCount, bool ByteOrderSwapped)
-    : TripCount(TripCount), ByteOrderSwapped(ByteOrderSwapped) {}
-
-KnownBits ValueEvolution::computeBinOp(const BinaryOperator *I) {
-  KnownBits KnownL(compute(I->getOperand(0)));
-  KnownBits KnownR(compute(I->getOperand(1)));
-
-  switch (I->getOpcode()) {
-  case Instruction::BinaryOps::And:
-    return KnownL & KnownR;
-  case Instruction::BinaryOps::Or:
-    return KnownL | KnownR;
-  case Instruction::BinaryOps::Xor:
-    return KnownL ^ KnownR;
-  case Instruction::BinaryOps::Shl: {
-    auto *OBO = cast<OverflowingBinaryOperator>(I);
-    return KnownBits::shl(KnownL, KnownR, OBO->hasNoUnsignedWrap(),
-                          OBO->hasNoSignedWrap());
-  }
-  case Instruction::BinaryOps::LShr:
-    return KnownBits::lshr(KnownL, KnownR);
-  case Instruction::BinaryOps::AShr:
-    return KnownBits::ashr(KnownL, KnownR);
-  case Instruction::BinaryOps::Add: {
-    auto *OBO = cast<OverflowingBinaryOperator>(I);
-    return KnownBits::add(KnownL, KnownR, OBO->hasNoUnsignedWrap(),
-                          OBO->hasNoSignedWrap());
-  }
-  case Instruction::BinaryOps::Sub: {
-    auto *OBO = cast<OverflowingBinaryOperator>(I);
-    return KnownBits::sub(KnownL, KnownR, OBO->hasNoUnsignedWrap(),
-                          OBO->hasNoSignedWrap());
-  }
-  case Instruction::BinaryOps::Mul: {
-    Value *Op0 = I->getOperand(0);
-    Value *Op1 = I->getOperand(1);
-    bool SelfMultiply = Op0 == Op1 && isGuaranteedNotToBeUndef(Op0);
-    return KnownBits::mul(KnownL, KnownR, SelfMultiply);
-  }
-  case Instruction::BinaryOps::UDiv:
-    return KnownBits::udiv(KnownL, KnownR);
-  case Instruction::BinaryOps::SDiv:
-    return KnownBits::sdiv(KnownL, KnownR);
-  case Instruction::BinaryOps::URem:
-    return KnownBits::urem(KnownL, KnownR);
-  case Instruction::BinaryOps::SRem:
-    return KnownBits::srem(KnownL, KnownR);
-  default:
-    ErrStr = "Unknown BinaryOperator";
-    unsigned BitWidth = I->getType()->getScalarSizeInBits();
-    return {BitWidth};
-  }
-}
-
-KnownBits ValueEvolution::computeInstr(const Instruction *I) {
-  unsigned BitWidth = I->getType()->getScalarSizeInBits();
-
-  // computeInstr is the only entry-point that needs to update the Visited set.
-  Visited.insert(I);
+  SmallVector<const Instruction *, 16> Worklist(Roots);
+  while (!Worklist.empty()) {
+    const Instruction *I = Worklist.pop_back_val();
+    Visited.insert(I);
 
-  // We look up in the map that contains the KnownBits of the PHI from the
-  // previous iteration.
-  if (const PHINode *P = dyn_cast<PHINode>(I))
-    return KnownPhis.lookup_or(P, BitWidth);
+    if (isa<PHINode>(I))
+      continue;
 
-  // Compute the KnownBits for a Select(Cmp()), forcing it to take the branch
-  // that is predicated on the (least|most)-significant-bit check.
-  CmpPredicate Pred;
-  Value *L, *R;
-  Instruction *TV, *FV;
-  if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Instruction(TV),
-                        m_Instruction(FV)))) {
-    Visited.insert(cast<Instruction>(I->getOperand(0)));
-
-    // We need to check LCR against [0, 2) in the little-endian case, because
-    // the RCR check is insufficient: it is simply [0, 1).
-    if (!ByteOrderSwapped) {
-      KnownBits KnownL = compute(L);
-      unsigned ICmpBW = KnownL.getBitWidth();
-      auto LCR = ConstantRange::fromKnownBits(KnownL, false);
-      auto CheckLCR = ConstantRange(APInt::getZero(ICmpBW), APInt(ICmpBW, 2));
-      if (LCR != CheckLCR) {
-        ErrStr = "Bad LHS of significant-bit-check";
-        return {BitWidth};
+    for (const Use &U : I->operands()) {
+      if (auto *UI = dyn_cast<Instruction>(U)) {
+        if (!L.contains(UI))
+          return true;
+        Worklist.push_back(UI);
       }
     }
-
-    // Check that the predication is on (most|least) significant bit.
-    KnownBits KnownR = compute(R);
-    unsigned ICmpBW = KnownR.getBitWidth();
-    auto RCR = ConstantRange::fromKnownBits(KnownR, false);
-    auto AllowedR = ConstantRange::makeAllowedICmpRegion(Pred, RCR);
-    ConstantRange CheckRCR(APInt::getZero(ICmpBW),
-                           ByteOrderSwapped ? APInt::getSignedMinValue(ICmpBW)
-                                            : APInt(ICmpBW, 1));
-
-    // We only compute KnownBits of either TV or FV, as the other value would
-    // just be a bit-shift as checked by isBigEndianBitShift.
-    if (AllowedR == CheckRCR) {
-      Visited.insert(FV);
-      return compute(TV);
-    }
-    if (AllowedR.inverse() == CheckRCR) {
-      Visited.insert(TV);
-      return compute(FV);
-    }
-
-    ErrStr = "Bad RHS of significant-bit-check";
-    return {BitWidth};
-  }
-
-  if (auto *BO = dyn_cast<BinaryOperator>(I))
-    return computeBinOp(BO);
-
-  switch (I->getOpcode()) {
-  case Instruction::CastOps::Trunc:
-    return compute(I->getOperand(0)).trunc(BitWidth);
-  case Instruction::CastOps::ZExt:
-    return compute(I->getOperand(0)).zext(BitWidth);
-  case Instruction::CastOps::SExt:
-    return compute(I->getOperand(0)).sext(BitWidth);
-  default:
-    ErrStr = "Unknown Instruction";
-    return {BitWidth};
   }
-}
-
-KnownBits ValueEvolution::compute(const Value *V) {
-  if (auto *CI = dyn_cast<ConstantInt>(V))
-    return KnownBits::makeConstant(CI->getValue());
-
-  if (auto *I = dyn_cast<Instruction>(V))
-    return computeInstr(I);
-
-  ErrStr = "Unknown Value";
-  unsigned BitWidth = V->getType()->getScalarSizeInBits();
-  return {BitWidth};
-}
-
-bool ValueEvolution::computeEvolutions(ArrayRef<PhiStepPair> PhiEvolutions) {
-  for (unsigned I = 0; I < TripCount; ++I)
-    for (auto [Phi, Step] : PhiEvolutions)
-      KnownPhis.emplace_or_assign(Phi, computeInstr(Step));
-
-  return ErrStr.empty();
+  return Latch->size() != Visited.size();
 }
 
 /// A structure that can hold either a Simple Recurrence or a Conditional
@@ -320,6 +149,62 @@ private:
       Instruction::BinaryOps BOWithConstOpToMatch = Instruction::BinaryOpsEnd);
 };
 
+/// Check the well-formedness of the (most|least) significant bit check given \p
+/// ConditionalRecurrence, \p SimpleRecurrence, depending on \p
+/// ByteOrderSwapped. We check that ConditionalRecurrence.Step is a
+/// Select(Cmp()) where the compare is `>= 0` in the big-endian case, and `== 0`
+/// in the little-endian case (or the inverse, in which case the branches of the
+/// compare are swapped). We check that the LHS is (ConditionalRecurrence.Phi
+/// [xor SimpleRecurrence.Phi]) in the big-endian case, and additionally check
+/// for an AND with one in the little-endian case. We then check AllowedByR
+/// against CheckAllowedByR, which is [0, smin) in the big-endian case, and is
+/// [0, 1) in the little-endian case. CheckAllowedByR checks for
+/// significant-bit-clear, and we match the corresponding arms of the select
+/// against bit-shift and bit-shift-and-xor-gen-poly.
+static bool
+isSignificantBitCheckWellFormed(const RecurrenceInfo &ConditionalRecurrence,
+                                const RecurrenceInfo &SimpleRecurrence,
+                                bool ByteOrderSwapped) {
+  auto *SI = cast<SelectInst>(ConditionalRecurrence.Step);
+  CmpPredicate Pred;
+  const Value *L;
+  const APInt *R;
+  Instruction *TV, *FV;
+  if (!match(SI, m_Select(m_ICmp(Pred, m_Value(L), m_APInt(R)),
+                          m_Instruction(TV), m_Instruction(FV))))
+    return false;
+
+  // Matcher for the compare's LHS, with or without a SimpleRecurrence (whose
+  // corresponding data is LHSAux).
+  auto MatchPred = m_CombineOr(
+      m_Specific(ConditionalRecurrence.Phi),
+      m_c_Xor(m_ZExtOrTruncOrSelf(m_Specific(ConditionalRecurrence.Phi)),
+              m_ZExtOrTruncOrSelf(m_Specific(SimpleRecurrence.Phi))));
+  bool LWellFormed = ByteOrderSwapped ? match(L, MatchPred)
+                                      : match(L, m_c_And(MatchPred, m_One()));
+  if (!LWellFormed)
+    return false;
+
+  KnownBits KnownR = KnownBits::makeConstant(*R);
+  unsigned BW = KnownR.getBitWidth();
+  auto RCR = ConstantRange::fromKnownBits(KnownR, false);
+  auto AllowedByR = ConstantRange::makeAllowedICmpRegion(Pred, RCR);
+  ConstantRange CheckAllowedByR(APInt::getZero(BW),
+                                ByteOrderSwapped ? APInt::getSignedMinValue(BW)
+                                                 : APInt(BW, 1));
+
+  BinaryOperator *BitShift = ConditionalRecurrence.BO;
+  if (AllowedByR == CheckAllowedByR)
+    return TV == BitShift &&
+           match(FV, m_c_Xor(m_Specific(BitShift),
+                             m_SpecificInt(*ConditionalRecurrence.ExtraConst)));
+  if (AllowedByR.inverse() == CheckAllowedByR)
+    return FV == BitShift &&
+           match(TV, m_c_Xor(m_Specific(BitShift),
+                             m_SpecificInt(*ConditionalRecurrence.ExtraConst)));
+  return false;
+}
+
 /// Wraps llvm::matchSimpleRecurrence. Match a simple first order recurrence
 /// cycle of the form:
 ///
@@ -336,8 +221,11 @@ private:
 ///    %BO = binop %step, %rec
 ///
 bool RecurrenceInfo::matchSimpleRecurrence(const PHINode *P) {
-  Phi = P;
-  return llvm::matchSimpleRecurrence(Phi, BO, Start, Step);
+  if (llvm::matchSimpleRecurrence(P, BO, Start, Step)) {
+    Phi = P;
+    return true;
+  }
+  return false;
 }
 
 /// Digs for a recurrence starting with \p V hitting the PHI node in a use-def
@@ -459,26 +347,6 @@ PolynomialInfo::PolynomialInfo(unsigned TripCount, Value *LHS, const APInt &RHS,
     : TripCount(TripCount), LHS(LHS), RHS(RHS), ComputedValue(ComputedValue),
       ByteOrderSwapped(ByteOrderSwapped), LHSAux(LHSAux) {}
 
-/// In the big-endian case, checks the bottom N bits against CheckFn, and that
-/// the rest are unknown. In the little-endian case, checks the top N bits
-/// against CheckFn, and that the rest are unknown. Callers usually call this
-/// function with N = TripCount, and CheckFn checking that the remainder bits of
-/// the CRC polynomial division are zero.
-static bool checkExtractBits(const KnownBits &Known, unsigned N,
-                             function_ref<bool(const KnownBits &)> CheckFn,
-                             bool ByteOrderSwapped) {
-  // Check that the entire thing is a constant.
-  if (N == Known.getBitWidth())
-    return CheckFn(Known.extractBits(N, 0));
-
-  // Check that the {top, bottom} N bits are not unknown and that the {bottom,
-  // top} N bits are known.
-  unsigned BitPos = ByteOrderSwapped ? 0 : Known.getBitWidth() - N;
-  unsigned SwappedBitPos = ByteOrderSwapped ? N : 0;
-  return CheckFn(Known.extractBits(N, BitPos)) &&
-         Known.extractBits(Known.getBitWidth() - N, SwappedBitPos).isUnknown();
-}
-
 /// Generate a lookup table of 256 entries by interleaving the generating
 /// polynomial. The optimization technique of table-lookup for CRC is also
 /// called the Sarwate algorithm.
@@ -511,8 +379,6 @@ CRCTable HashRecognize::genSarwateTable(const APInt &GenPoly,
 /// Checks that \p P1 and \p P2 are used together in an XOR in the use-def chain
 /// of \p SI's condition, ignoring any casts. The purpose of this function is to
 /// ensure that LHSAux from the SimpleRecurrence is used correctly in the CRC
-/// computation. We cannot check the correctness of casts at this point, and
-/// rely on the KnownBits propagation to check correctness of the CRC
 /// computation.
 ///
 /// In other words, it checks for the following pattern:
@@ -540,8 +406,8 @@ static bool isConditionalOnXorOfPHIs(const SelectInst *SI, const PHINode *P1,
       continue;
 
     // If we match an XOR of the two PHIs ignoring casts, we're done.
-    if (match(I, m_c_Xor(m_CastOrSelf(m_Specific(P1)),
-                         m_CastOrSelf(m_Specific(P2)))))
+    if (match(I, m_c_Xor(m_ZExtOrTruncOrSelf(m_Specific(P1)),
+                         m_ZExtOrTruncOrSelf(m_Specific(P2)))))
       return true;
 
     // Continue along the use-def chain.
@@ -570,10 +436,8 @@ static std::optional<bool> isBigEndianBitShift(Value *V, ScalarEvolution &SE) {
 }
 
 /// The main entry point for analyzing a loop and recognizing the CRC algorithm.
-/// Returns a PolynomialInfo on success, and either an ErrBits or a StringRef on
-/// failure.
-std::variant<PolynomialInfo, ErrBits, StringRef>
-HashRecognize::recognizeCRC() const {
+/// Returns a PolynomialInfo on success, and a StringRef on failure.
+std::variant<PolynomialInfo, StringRef> HashRecognize::recognizeCRC() const {
   if (!L.isInnermost())
     return "Loop is not innermost";
   BasicBlock *Latch = L.getLoopLatch();
@@ -582,7 +446,7 @@ HashRecognize::recognizeCRC() const {
   if (!Latch || !Exit || !IndVar || L.getNumBlocks() != 1)
     return "Loop not in canonical form";
   unsigned TC = SE.getSmallConstantTripCount(&L);
-  if (!TC || TC > 256 || TC % 8)
+  if (!TC || TC % 8)
     return "Unable to find a small constant byte-multiple trip count";
 
   auto R = getRecurrences(Latch, IndVar, L);
@@ -637,36 +501,19 @@ HashRecognize::recognizeCRC() const {
          "Expected ExtraConst in conditional recurrence");
   const APInt &GenPoly = *ConditionalRecurrence.ExtraConst;
 
-  // PhiEvolutions are pairs of PHINodes along with their incoming value from
-  // within the loop, which we term as their step. Note that in the case of a
-  // Simple Recurrence, Step is an operand of the BO, while in a Conditional
-  // Recurrence, it is a SelectInst.
-  SmallVector<PhiStepPair, 2> PhiEvolutions;
-  PhiEvolutions.emplace_back(ConditionalRecurrence.Phi, ComputedValue);
+  if (!isSignificantBitCheckWellFormed(ConditionalRecurrence, SimpleRecurrence,
+                                       *ByteOrderSwapped))
+    return "Malformed significant-bit check";
+
+  SmallVector<const Instruction *> Roots(
+      {ComputedValue,
+       cast<Instruction>(IndVar->getIncomingValueForBlock(Latch)),
+       L.getLatchCmpInst(), Latch->getTerminator()});
   if (SimpleRecurrence)
-    PhiEvolutions.emplace_back(SimpleRecurrence.Phi, SimpleRecurrence.BO);
-
-  ValueEvolution VE(TC, *ByteOrderSwapped);
-  if (!VE.computeEvolutions(PhiEvolutions))
-    return VE.getError();
-  KnownBits ResultBits = VE.KnownPhis.at(ConditionalRecurrence.Phi);
-
-  // There must be exactly four unvisited instructions, corresponding to the
-  // IndVar PHI. Any other unvisited instructions from the KnownBits propagation
-  // can complicate the optimization, which replaces the entire loop with the
-  // table-lookup version of the hash algorithm.
-  std::initializer_list<const Instruction *> AugmentVisited = {
-      IndVar, Latch->getTerminator(), L.getLatchCmpInst(),
-      cast<Instruction>(IndVar->getIncomingValueForBlock(Latch))};
-  VE.Visited.insert_range(AugmentVisited);
-  if (std::distance(Latch->begin(), Latch->end()) != VE.Visited.size())
+    Roots.push_back(SimpleRecurrence.BO);
+  if (containsUnreachable(L, Roots))
     return "Found stray unvisited instructions";
 
-  unsigned N = std::min(TC, ResultBits.getBitWidth());
-  auto IsZero = [](const KnownBits &K) { return K.isZero(); };
-  if (!checkExtractBits(ResultBits, N, IsZero, *ByteOrderSwapped))
-    return ErrBits(ResultBits, TC, *ByteOrderSwapped);
-
   return PolynomialInfo(TC, LHS, GenPoly, ComputedValue, *ByteOrderSwapped,
                         LHSAux);
 }
@@ -693,13 +540,6 @@ void HashRecognize::print(raw_ostream &OS) const {
     OS << "Did not find a hash algorithm\n";
     if (std::holds_alternative<StringRef>(Ret))
       OS << "Reason: " << std::get<StringRef>(Ret) << "\n";
-    if (std::holds_alternative<ErrBits>(Ret)) {
-      auto [Actual, Iter, ByteOrderSwapped] = std::get<ErrBits>(Ret);
-      OS << "Reason: Expected " << (ByteOrderSwapped ? "bottom " : "top ")
-         << Iter << " bits zero (";
-      Actual.print(OS);
-      OS << ")\n";
-    }
     return;
   }
 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 95f30fd3f427..99afc0601d52 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -32,11 +32,11 @@ using namespace ir2vec;
 #define DEBUG_TYPE "ir2vec"
 
 STATISTIC(VocabMissCounter,
-          "Number of lookups to entites not present in the vocabulary");
+          "Number of lookups to entities not present in the vocabulary");
 
 namespace llvm {
 namespace ir2vec {
-static cl::OptionCategory IR2VecCategory("IR2Vec Options");
+cl::OptionCategory IR2VecCategory("IR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
 static cl::opt<std::string>
@@ -52,6 +52,15 @@ cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional, cl::init(0.5),
 cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional, cl::init(0.2),
                          cl::desc("Weight for argument embeddings"),
                          cl::cat(IR2VecCategory));
+cl::opt<IR2VecKind> IR2VecEmbeddingKind(
+    "ir2vec-kind", cl::Optional,
+    cl::values(clEnumValN(IR2VecKind::Symbolic, "symbolic",
+                          "Generate symbolic embeddings"),
+               clEnumValN(IR2VecKind::FlowAware, "flow-aware",
+                          "Generate flow-aware embeddings")),
+    cl::init(IR2VecKind::Symbolic), cl::desc("IR2Vec embedding kind"),
+    cl::cat(IR2VecCategory));
+
 } // namespace ir2vec
 } // namespace llvm
 
@@ -123,8 +132,12 @@ bool Embedding::approximatelyEquals(const Embedding &RHS,
                                     double Tolerance) const {
   assert(this->size() == RHS.size() && "Vectors must have the same dimension");
   for (size_t Itr = 0; Itr < this->size(); ++Itr)
-    if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance)
+    if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance) {
+      LLVM_DEBUG(errs() << "Embedding mismatch at index " << Itr << ": "
+                        << (*this)[Itr] << " vs " << RHS[Itr]
+                        << "; Tolerance: " << Tolerance << "\n");
       return false;
+    }
   return true;
 }
 
@@ -141,14 +154,16 @@ void Embedding::print(raw_ostream &OS) const {
 
 Embedder::Embedder(const Function &F, const Vocabulary &Vocab)
     : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
-      OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight) {
-}
+      OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight),
+      FuncVector(Embedding(Dimension)) {}
 
 std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
                                            const Vocabulary &Vocab) {
   switch (Mode) {
   case IR2VecKind::Symbolic:
     return std::make_unique<SymbolicEmbedder>(F, Vocab);
+  case IR2VecKind::FlowAware:
+    return std::make_unique<FlowAwareEmbedder>(F, Vocab);
   }
   return nullptr;
 }
@@ -180,6 +195,17 @@ const Embedding &Embedder::getFunctionVector() const {
   return FuncVector;
 }
 
+void Embedder::computeEmbeddings() const {
+  if (F.isDeclaration())
+    return;
+
+  // Consider only the basic blocks that are reachable from entry
+  for (const BasicBlock *BB : depth_first(&F)) {
+    computeEmbeddings(*BB);
+    FuncVector += BBVecMap[BB];
+  }
+}
+
 void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   Embedding BBVector(Dimension, 0);
 
@@ -187,7 +213,7 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   for (const auto &I : BB.instructionsWithoutDebug()) {
     Embedding ArgEmb(Dimension, 0);
     for (const auto &Op : I.operands())
-      ArgEmb += Vocab[Op];
+      ArgEmb += Vocab[*Op];
     auto InstVector =
         Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
     InstVecMap[&I] = InstVector;
@@ -196,51 +222,75 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   BBVecMap[&BB] = BBVector;
 }
 
-void SymbolicEmbedder::computeEmbeddings() const {
-  if (F.isDeclaration())
-    return;
+void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
+  Embedding BBVector(Dimension, 0);
 
-  // Consider only the basic blocks that are reachable from entry
-  for (const BasicBlock *BB : depth_first(&F)) {
-    computeEmbeddings(*BB);
-    FuncVector += BBVecMap[BB];
+  // We consider only the non-debug and non-pseudo instructions
+  for (const auto &I : BB.instructionsWithoutDebug()) {
+    // TODO: Handle call instructions differently.
+    // For now, we treat them like other instructions
+    Embedding ArgEmb(Dimension, 0);
+    for (const auto &Op : I.operands()) {
+      // If the operand is defined elsewhere, we use its embedding
+      if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
+        auto DefIt = InstVecMap.find(DefInst);
+        assert(DefIt != InstVecMap.end() &&
+               "Instruction should have been processed before its operands");
+        ArgEmb += DefIt->second;
+        continue;
+      }
+      // If the operand is not defined by an instruction, we use the vocabulary
+      else {
+        LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
+                          << *Op << "=" << Vocab[*Op][0] << "\n");
+        ArgEmb += Vocab[*Op];
+      }
+    }
+    // Create the instruction vector by combining opcode, type, and arguments
+    // embeddings
+    auto InstVector =
+        Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+    InstVecMap[&I] = InstVector;
+    BBVector += InstVector;
   }
+  BBVecMap[&BB] = BBVector;
 }
 
 // ==----------------------------------------------------------------------===//
 // Vocabulary
 //===----------------------------------------------------------------------===//
 
-Vocabulary::Vocabulary(VocabVector &&Vocab)
-    : Vocab(std::move(Vocab)), Valid(true) {}
+unsigned Vocabulary::getDimension() const {
+  assert(isValid() && "IR2Vec Vocabulary is invalid");
+  return Vocab[0].size();
+}
 
-bool Vocabulary::isValid() const {
-  return Vocab.size() == Vocabulary::expectedSize() && Valid;
+unsigned Vocabulary::getSlotIndex(unsigned Opcode) {
+  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
+  return Opcode - 1; // Convert to zero-based index
 }
 
-size_t Vocabulary::size() const {
-  assert(Valid && "IR2Vec Vocabulary is invalid");
-  return Vocab.size();
+unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) {
+  assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
+  return MaxOpcodes + static_cast<unsigned>(getCanonicalTypeID(TypeID));
 }
 
-unsigned Vocabulary::getDimension() const {
-  assert(Valid && "IR2Vec Vocabulary is invalid");
-  return Vocab[0].size();
+unsigned Vocabulary::getSlotIndex(const Value &Op) {
+  unsigned Index = static_cast<unsigned>(getOperandKind(&Op));
+  assert(Index < MaxOperandKinds && "Invalid OperandKind");
+  return MaxOpcodes + MaxCanonicalTypeIDs + Index;
 }
 
 const Embedding &Vocabulary::operator[](unsigned Opcode) const {
-  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
-  return Vocab[Opcode - 1];
+  return Vocab[getSlotIndex(Opcode)];
 }
 
-const Embedding &Vocabulary::operator[](Type::TypeID TypeId) const {
-  assert(static_cast<unsigned>(TypeId) < MaxTypeIDs && "Invalid type ID");
-  return Vocab[MaxOpcodes + static_cast<unsigned>(TypeId)];
+const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const {
+  return Vocab[getSlotIndex(TypeID)];
 }
 
-const ir2vec::Embedding &Vocabulary::operator[](const Value *Arg) const {
-  OperandKind ArgKind = getOperandKind(Arg);
-  return Vocab[MaxOpcodes + MaxTypeIDs + static_cast<unsigned>(ArgKind)];
+const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const {
+  return Vocab[getSlotIndex(Arg)];
 }
 
 StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
@@ -254,43 +304,21 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
   return "UnknownOpcode";
 }
 
+StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) {
+  unsigned Index = static_cast<unsigned>(CType);
+  assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID");
+  return CanonicalTypeNames[Index];
+}
+
+Vocabulary::CanonicalTypeID
+Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) {
+  unsigned Index = static_cast<unsigned>(TypeID);
+  assert(Index < MaxTypeIDs && "Invalid TypeID");
+  return TypeIDMapping[Index];
+}
+
 StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) {
-  switch (TypeID) {
-  case Type::VoidTyID:
-    return "VoidTy";
-  case Type::HalfTyID:
-  case Type::BFloatTyID:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-    return "FloatTy";
-  case Type::IntegerTyID:
-    return "IntegerTy";
-  case Type::FunctionTyID:
-    return "FunctionTy";
-  case Type::StructTyID:
-    return "StructTy";
-  case Type::ArrayTyID:
-    return "ArrayTy";
-  case Type::PointerTyID:
-  case Type::TypedPointerTyID:
-    return "PointerTy";
-  case Type::FixedVectorTyID:
-  case Type::ScalableVectorTyID:
-    return "VectorTy";
-  case Type::LabelTyID:
-    return "LabelTy";
-  case Type::TokenTyID:
-    return "TokenTy";
-  case Type::MetadataTyID:
-    return "MetadataTy";
-  case Type::X86_AMXTyID:
-  case Type::TargetExtTyID:
-    return "UnknownTy";
-  }
-  return "UnknownTy";
+  return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID));
 }
 
 StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
@@ -299,20 +327,6 @@ StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
   return OperandKindNames[Index];
 }
 
-Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) {
-  VocabVector DummyVocab;
-  float DummyVal = 0.1f;
-  // Create a dummy vocabulary with entries for all opcodes, types, and
-  // operand
-  for ([[maybe_unused]] unsigned _ :
-       seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxTypeIDs +
-                   Vocabulary::MaxOperandKinds)) {
-    DummyVocab.push_back(Embedding(Dim, DummyVal));
-    DummyVal += 0.1f;
-  }
-  return DummyVocab;
-}
-
 // Helper function to classify an operand into OperandKind
 Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
   if (isa<Function>(Op))
@@ -324,34 +338,18 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
   return OperandKind::VariableID;
 }
 
-unsigned Vocabulary::getNumericID(unsigned Opcode) {
-  assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
-  return Opcode - 1; // Convert to zero-based index
-}
-
-unsigned Vocabulary::getNumericID(Type::TypeID TypeID) {
-  assert(static_cast<unsigned>(TypeID) < MaxTypeIDs && "Invalid type ID");
-  return MaxOpcodes + static_cast<unsigned>(TypeID);
-}
-
-unsigned Vocabulary::getNumericID(const Value *Op) {
-  unsigned Index = static_cast<unsigned>(getOperandKind(Op));
-  assert(Index < MaxOperandKinds && "Invalid OperandKind");
-  return MaxOpcodes + MaxTypeIDs + Index;
-}
-
 StringRef Vocabulary::getStringKey(unsigned Pos) {
-  assert(Pos < Vocabulary::expectedSize() &&
-         "Position out of bounds in vocabulary");
+  assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary");
   // Opcode
   if (Pos < MaxOpcodes)
     return getVocabKeyForOpcode(Pos + 1);
   // Type
-  if (Pos < MaxOpcodes + MaxTypeIDs)
-    return getVocabKeyForTypeID(static_cast<Type::TypeID>(Pos - MaxOpcodes));
+  if (Pos < MaxOpcodes + MaxCanonicalTypeIDs)
+    return getVocabKeyForCanonicalTypeID(
+        static_cast<CanonicalTypeID>(Pos - MaxOpcodes));
   // Operand
   return getVocabKeyForOperandKind(
-      static_cast<OperandKind>(Pos - MaxOpcodes - MaxTypeIDs));
+      static_cast<OperandKind>(Pos - MaxOpcodes - MaxCanonicalTypeIDs));
 }
 
 // For now, assume vocabulary is stable unless explicitly invalidated.
@@ -361,6 +359,21 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA,
   return !(PAC.preservedWhenStateless());
 }
 
+Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) {
+  VocabVector DummyVocab;
+  DummyVocab.reserve(NumCanonicalEntries);
+  float DummyVal = 0.1f;
+  // Create a dummy vocabulary with entries for all opcodes, types, and
+  // operands
+  for ([[maybe_unused]] unsigned _ :
+       seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs +
+                   Vocabulary::MaxOperandKinds)) {
+    DummyVocab.push_back(Embedding(Dim, DummyVal));
+    DummyVal += 0.1f;
+  }
+  return DummyVocab;
+}
+
 // ==----------------------------------------------------------------------===//
 // IR2VecVocabAnalysis
 //===----------------------------------------------------------------------===//
@@ -452,7 +465,8 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
 
   // Handle Opcodes
   std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
-                                                 Embedding(Dim, 0));
+                                                 Embedding(Dim));
+  NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes);
   for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {
     StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1);
     auto It = OpcVocab.find(VocabKey.str());
@@ -464,14 +478,15 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
   Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(),
                NumericOpcodeEmbeddings.end());
 
-  // Handle Types
-  std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxTypeIDs,
-                                               Embedding(Dim, 0));
-  for (unsigned TypeID : seq(0u, Vocabulary::MaxTypeIDs)) {
-    StringRef VocabKey =
-        Vocabulary::getVocabKeyForTypeID(static_cast<Type::TypeID>(TypeID));
+  // Handle Types - only canonical types are present in vocabulary
+  std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs,
+                                               Embedding(Dim));
+  NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs);
+  for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) {
+    StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID(
+        static_cast<Vocabulary::CanonicalTypeID>(CTypeID));
     if (auto It = TypeVocab.find(VocabKey.str()); It != TypeVocab.end()) {
-      NumericTypeEmbeddings[TypeID] = It->second;
+      NumericTypeEmbeddings[CTypeID] = It->second;
       continue;
     }
     handleMissingEntity(VocabKey.str());
@@ -481,7 +496,8 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
 
   // Handle Arguments/Operands
   std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds,
-                                              Embedding(Dim, 0));
+                                              Embedding(Dim));
+  NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds);
   for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) {
     Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind);
     StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind);
@@ -552,8 +568,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
   assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid");
 
   for (Function &F : M) {
-    std::unique_ptr<Embedder> Emb =
-        Embedder::create(IR2VecKind::Symbolic, F, Vocabulary);
+    auto Emb = Embedder::create(IR2VecEmbeddingKind, F, Vocabulary);
     if (!Emb) {
       OS << "Error creating IR2Vec embeddings \n";
       continue;
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 5907e2106533..ebe329aa1d5f 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5242,6 +5242,19 @@ static Value *simplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
     }
   }
 
+  // Simplify umul_with_overflow where one operand is 1.
+  Value *V;
+  if (Idxs.size() == 1 &&
+      (match(Agg,
+             m_Intrinsic<Intrinsic::umul_with_overflow>(m_Value(V), m_One())) ||
+       match(Agg, m_Intrinsic<Intrinsic::umul_with_overflow>(m_One(),
+                                                             m_Value(V))))) {
+    if (Idxs[0] == 0)
+      return V;
+    assert(Idxs[0] == 1 && "invalid index");
+    return getFalse(CmpInst::makeCmpResultType(V->getType()));
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index c7b0ca97a8e4..90bae77bcf70 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1493,6 +1493,24 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
             //   br %Condition, label %then, label %else
             APInt ConditionVal(1, isTrueDest ? 1 : 0);
             Result = constantFoldUser(Usr, Condition, ConditionVal, DL);
+          } else if (isa<TruncInst, ZExtInst, SExtInst>(Usr)) {
+            ValueLatticeElement OpLatticeVal =
+                *getValueFromCondition(Usr->getOperand(0), Condition,
+                                       isTrueDest, /*UseBlockValue*/ false);
+
+            if (!OpLatticeVal.isConstantRange())
+              return OpLatticeVal;
+
+            const unsigned ResultBitWidth =
+                Usr->getType()->getScalarSizeInBits();
+            if (auto *Trunc = dyn_cast<TruncInst>(Usr))
+              return ValueLatticeElement::getRange(
+                  OpLatticeVal.getConstantRange().truncate(
+                      ResultBitWidth, Trunc->getNoWrapKind()));
+
+            return ValueLatticeElement::getRange(
+                OpLatticeVal.getConstantRange().castOp(
+                    cast<CastInst>(Usr)->getOpcode(), ResultBitWidth));
           } else {
             // If one of Val's operand has an inferred value, we may be able to
             // infer the value of Val.
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 1168005f48c0..32a4264c0343 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -374,13 +374,6 @@ void Lint::visitCallBase(CallBase &I) {
       visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI),
                            std::nullopt, nullptr, MemRef::Read | MemRef::Write);
       break;
-    case Intrinsic::get_active_lane_mask:
-      if (auto *TripCount = dyn_cast<ConstantInt>(I.getArgOperand(1)))
-        Check(!TripCount->isZero(),
-              "get_active_lane_mask: operand #2 "
-              "must be greater than 0",
-              &I);
-      break;
     }
 }
 
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 9a2c9ba63ec7..0c4e3a2e3b23 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
 
@@ -331,17 +332,10 @@ bool llvm::isDereferenceableAndAlignedInLoop(
                             : SE.getBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     return false;
-
-  if (isa<SCEVCouldNotCompute>(BECount)) {
-    // TODO: Support symbolic max backedge taken counts for loops without
-    // computable backedge taken counts.
-    MaxBECount =
-        Predicates
-            ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
-            : SE.getConstantMaxBackedgeTakenCount(L);
-  }
-  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
-      L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC);
+  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
+  const auto &[AccessStart, AccessEnd] =
+      getStartAndEndForAccess(L, PtrScev, LI->getType(), BECount, MaxBECount,
+                              &SE, nullptr, &DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(AccessStart) ||
       isa<SCEVCouldNotCompute>(AccessEnd))
     return false;
@@ -350,7 +344,13 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
   if (isa<SCEVCouldNotCompute>(PtrDiff))
     return false;
-  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
+
+  if (!LoopGuards)
+    LoopGuards.emplace(
+        ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE));
+
+  APInt MaxPtrDiff =
+      SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, *LoopGuards));
 
   Value *Base = nullptr;
   APInt AccessSize;
@@ -381,7 +381,10 @@ bool llvm::isDereferenceableAndAlignedInLoop(
     if (Offset->getAPInt().urem(Alignment.value()) != 0)
       return false;
 
-    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    bool Overflow = false;
+    AccessSize = MaxPtrDiff.uadd_ov(Offset->getAPInt(), Overflow);
+    if (Overflow)
+      return false;
     AccessSizeSCEV = SE.getAddExpr(PtrDiff, Offset);
     Base = NewBase->getValue();
   } else
@@ -390,9 +393,11 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt();
   return isDereferenceableAndAlignedPointerViaAssumption(
              Base, Alignment,
-             [&SE, AccessSizeSCEV](const RetainedKnowledge &RK) {
-               return SE.isKnownPredicate(CmpInst::ICMP_ULE, AccessSizeSCEV,
-                                          SE.getSCEV(RK.IRArgValue));
+             [&SE, AccessSizeSCEV, &LoopGuards](const RetainedKnowledge &RK) {
+               return SE.isKnownPredicate(
+                   CmpInst::ICMP_ULE,
+                   SE.applyLoopGuards(AccessSizeSCEV, *LoopGuards),
+                   SE.applyLoopGuards(SE.getSCEV(RK.IRArgValue), *LoopGuards));
              },
              DL, HeaderFirstNonPHI, AC, &DT) ||
          isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
@@ -855,17 +860,83 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
   return isPointerAlwaysReplaceable(From, To, DL);
 }
 
-bool llvm::isDereferenceableReadOnlyLoop(
+bool llvm::isReadOnlyLoop(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<LoadInst *> &NonDereferenceableAndAlignedLoads,
     SmallVectorImpl<const SCEVPredicate *> *Predicates) {
   for (BasicBlock *BB : L->blocks()) {
     for (Instruction &I : *BB) {
       if (auto *LI = dyn_cast<LoadInst>(&I)) {
         if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
-          return false;
-      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
+          NonDereferenceableAndAlignedLoads.push_back(LI);
+      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+                 I.mayThrow()) {
         return false;
+      }
     }
   }
   return true;
 }
+
+LinearExpression llvm::decomposeLinearExpression(const DataLayout &DL,
+                                                 Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Must be called with pointer arg");
+
+  unsigned BitWidth = DL.getIndexTypeSizeInBits(Ptr->getType());
+  LinearExpression Expr(Ptr, BitWidth);
+
+  while (true) {
+    auto *GEP = dyn_cast<GEPOperator>(Expr.BasePtr);
+    if (!GEP || GEP->getSourceElementType()->isScalableTy())
+      return Expr;
+
+    Value *VarIndex = nullptr;
+    for (Value *Index : GEP->indices()) {
+      if (isa<ConstantInt>(Index))
+        continue;
+      // Only allow a single variable index. We do not bother to handle the
+      // case of the same variable index appearing multiple times.
+      if (Expr.Index || VarIndex)
+        return Expr;
+      VarIndex = Index;
+    }
+
+    // Don't return non-canonical indexes.
+    if (VarIndex && !VarIndex->getType()->isIntegerTy(BitWidth))
+      return Expr;
+
+    // We have verified that we can fully handle this GEP, so we can update Expr
+    // members past this point.
+    Expr.BasePtr = GEP->getPointerOperand();
+    Expr.Flags = Expr.Flags.intersectForOffsetAdd(GEP->getNoWrapFlags());
+    for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+         GTI != GTE; ++GTI) {
+      Value *Index = GTI.getOperand();
+      if (auto *ConstOffset = dyn_cast<ConstantInt>(Index)) {
+        if (ConstOffset->isZero())
+          continue;
+        if (StructType *STy = GTI.getStructTypeOrNull()) {
+          unsigned ElementIdx = ConstOffset->getZExtValue();
+          const StructLayout *SL = DL.getStructLayout(STy);
+          Expr.Offset += SL->getElementOffset(ElementIdx);
+          continue;
+        }
+        // Truncate if type size exceeds index space.
+        APInt IndexedSize(BitWidth, GTI.getSequentialElementStride(DL),
+                          /*isSigned=*/false,
+                          /*implicitTrunc=*/true);
+        Expr.Offset += ConstOffset->getValue() * IndexedSize;
+        continue;
+      }
+
+      // FIXME: Also look through a mul/shl in the index.
+      assert(Expr.Index == nullptr && "Shouldn't have index yet");
+      Expr.Index = Index;
+      // Truncate if type size exceeds index space.
+      Expr.Scale = APInt(BitWidth, GTI.getSequentialElementStride(DL),
+                         /*isSigned=*/false, /*implicitTrunc=*/true);
+    }
+  }
+
+  return Expr;
+}
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bceddd032527..87fae92977cd 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -210,11 +210,11 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
 
 /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
 /// \p MaxBTC is guaranteed inbounds of the accessed object.
-static bool
-evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
-                                     const SCEV *MaxBTC, const SCEV *EltSize,
-                                     ScalarEvolution &SE, const DataLayout &DL,
-                                     DominatorTree *DT, AssumptionCache *AC) {
+static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
+    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
+    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
+    AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
   auto *PointerBase = SE.getPointerBase(AR->getStart());
   auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
   if (!StartPtr)
@@ -238,8 +238,8 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
         StartPtrV, {Attribute::Dereferenceable}, *AC,
         L->getLoopPredecessor()->getTerminator(), DT);
     if (DerefRK) {
-      DerefBytesSCEV = SE.getUMaxExpr(
-          DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue));
+      DerefBytesSCEV =
+          SE.getUMaxExpr(DerefBytesSCEV, SE.getSCEV(DerefRK.IRArgValue));
     }
   }
 
@@ -259,10 +259,25 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
   const SCEV *StartOffset = SE.getNoopOrZeroExtend(
       SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);
 
+  if (!LoopGuards)
+    LoopGuards.emplace(ScalarEvolution::LoopGuards::collect(AR->getLoop(), SE));
+  MaxBTC = SE.applyLoopGuards(MaxBTC, *LoopGuards);
+
   const SCEV *OffsetAtLastIter =
       mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
-  if (!OffsetAtLastIter)
-    return false;
+  if (!OffsetAtLastIter) {
+    // Re-try with constant max backedge-taken count if using the symbolic one
+    // failed.
+    MaxBTC = SE.getConstantMaxBackedgeTakenCount(AR->getLoop());
+    if (isa<SCEVCouldNotCompute>(MaxBTC))
+      return false;
+    MaxBTC = SE.getNoopOrZeroExtend(
+        MaxBTC, WiderTy);
+    OffsetAtLastIter =
+        mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
+    if (!OffsetAtLastIter)
+      return false;
+  }
 
   const SCEV *OffsetEndBytes = addSCEVNoOverflow(
       OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE);
@@ -276,6 +291,8 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
     const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
     if (!EndBytes)
       return false;
+
+    DerefBytesSCEV = SE.applyLoopGuards(DerefBytesSCEV, *LoopGuards);
     return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV);
   }
 
@@ -292,7 +309,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
     const SCEV *MaxBTC, ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
              std::pair<const SCEV *, const SCEV *>> *PointerBounds,
-    DominatorTree *DT, AssumptionCache *AC) {
+    DominatorTree *DT, AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
   std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
   if (PointerBounds) {
     auto [Iter, Ins] = PointerBounds->insert(
@@ -328,7 +346,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
       // separately checks that accesses cannot not wrap, so unsigned max
       // represents an upper bound.
       if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL,
-                                               DT, AC)) {
+                                               DT, AC, LoopGuards)) {
         ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
       } else {
         ScEnd = SE->getAddExpr(
@@ -377,7 +395,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
   const SCEV *BTC = PSE.getBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
       Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(),
-      &DC.getPointerBounds(), DC.getDT(), DC.getAC());
+      &DC.getPointerBounds(), DC.getDT(), DC.getAC(), LoopGuards);
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1213,48 +1231,40 @@ static void findForkedSCEVs(
   }
 }
 
-static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
-findForkedPointer(PredicatedScalarEvolution &PSE,
-                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
-                  const Loop *L) {
-  ScalarEvolution *SE = PSE.getSE();
-  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
-  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
-  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
-
-  // For now, we will only accept a forked pointer with two possible SCEVs
-  // that are either SCEVAddRecExprs or loop invariant.
-  if (Scevs.size() == 2 &&
-      (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
-       SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
-      (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
-       SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
-    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
-    LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
-    LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
-    return Scevs;
-  }
-
-  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
-}
-
 bool AccessAnalysis::createCheckForAccess(
     RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
     const DenseMap<Value *, const SCEV *> &StridesMap,
     DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
     unsigned &RunningDepId, unsigned ASId, bool Assume) {
   Value *Ptr = Access.getPointer();
+  ScalarEvolution *SE = PSE.getSE();
+  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
 
-  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
-      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
-  assert(!TranslatedPtrs.empty() && "must have some translated pointers");
+  SmallVector<PointerIntPair<const SCEV *, 1, bool>> RTCheckPtrs;
+  findForkedSCEVs(SE, TheLoop, Ptr, RTCheckPtrs, MaxForkedSCEVDepth);
+  assert(!RTCheckPtrs.empty() &&
+         "Must have some runtime-check pointer candidates");
+
+  // RTCheckPtrs must have size 2 if there are forked pointers. Otherwise, there
+  // are no forked pointers; use replaceSymbolicStrideSCEV in this case.
+  auto IsLoopInvariantOrAR =
+      [&SE, &TheLoop](const PointerIntPair<const SCEV *, 1, bool> &P) {
+        return SE->isLoopInvariant(P.getPointer(), TheLoop) ||
+               isa<SCEVAddRecExpr>(P.getPointer());
+      };
+  if (RTCheckPtrs.size() == 2 && all_of(RTCheckPtrs, IsLoopInvariantOrAR)) {
+    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n";
+               for (const auto &[Idx, Q] : enumerate(RTCheckPtrs)) dbgs()
+               << "\t(" << Idx << ") " << *Q.getPointer() << "\n");
+  } else {
+    RTCheckPtrs = {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
+  }
 
   /// Check whether all pointers can participate in a runtime bounds check. They
-  /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also
-  /// must not wrap.
-  for (auto &P : TranslatedPtrs) {
+  /// must either be invariant or non-wrapping affine AddRecs.
+  for (auto &P : RTCheckPtrs) {
     // The bounds for loop-invariant pointer is trivial.
-    if (PSE.getSE()->isLoopInvariant(P.getPointer(), TheLoop))
+    if (SE->isLoopInvariant(P.getPointer(), TheLoop))
       continue;
 
     const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(P.getPointer());
@@ -1265,21 +1275,18 @@ bool AccessAnalysis::createCheckForAccess(
 
     // If there's only one option for Ptr, look it up after bounds and wrap
     // checking, because assumptions might have been added to PSE.
-    if (TranslatedPtrs.size() == 1) {
+    if (RTCheckPtrs.size() == 1) {
       AR =
           cast<SCEVAddRecExpr>(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
       P.setPointer(AR);
     }
 
-    // When we run after a failing dependency check we have to make sure
-    // we don't have wrapping pointers.
-    if (!isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
-                  TheLoop, Assume)) {
+    if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
+                  TheLoop, Assume))
       return false;
-    }
   }
 
-  for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
+  for (const auto &[PtrExpr, NeedsFreeze] : RTCheckPtrs) {
     // The id of the dependence set.
     unsigned DepId;
 
@@ -1983,13 +1990,13 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
   ScalarEvolution &SE = *PSE.getSE();
   const auto &[SrcStart_, SrcEnd_] =
       getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC);
+                              &SE, &PointerBounds, DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_))
     return false;
 
   const auto &[SinkStart_, SinkEnd_] =
       getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC);
+                              &SE, &PointerBounds, DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(SinkStart_) ||
       isa<SCEVCouldNotCompute>(SinkEnd_))
     return false;
@@ -3036,8 +3043,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
         TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
 
   DepChecker = std::make_unique<MemoryDepChecker>(
-      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits);
-  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
+      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards);
+  PtrRtChecking =
+      std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards);
   if (canAnalyzeLoop())
     CanVecMem = analyzeLoop(AA, LI, TLI, DT);
 }
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index 8853a13972be..f90717d3085e 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ReleaseModeModelRunner.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TensorSpec.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
@@ -74,21 +75,22 @@ llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM,
   if (!llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() &&
       InteractiveChannelBaseName.empty())
     return nullptr;
-  std::unique_ptr<MLModelRunner> AOTRunner;
-  if (InteractiveChannelBaseName.empty())
-    AOTRunner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>(
-        M.getContext(), FeatureMap, DecisionName,
-        EmbeddedModelRunnerOptions().setModelSelector(ModelSelector));
-  else {
-    auto Features = FeatureMap;
-    if (InteractiveIncludeDefault)
-      Features.push_back(DefaultDecisionSpec);
-    AOTRunner = std::make_unique<InteractiveModelRunner>(
-        M.getContext(), Features, InlineDecisionSpec,
-        InteractiveChannelBaseName + ".out",
-        InteractiveChannelBaseName + ".in");
-  }
-  return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner),
+  auto RunnerFactory = [&](const std::vector<TensorSpec> &InputFeatures)
+      -> std::unique_ptr<MLModelRunner> {
+    std::unique_ptr<MLModelRunner> AOTRunner;
+    if (InteractiveChannelBaseName.empty())
+      AOTRunner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>(
+          M.getContext(), InputFeatures, DecisionName,
+          EmbeddedModelRunnerOptions().setModelSelector(ModelSelector));
+    else {
+      AOTRunner = std::make_unique<InteractiveModelRunner>(
+          M.getContext(), InputFeatures, InlineDecisionSpec,
+          InteractiveChannelBaseName + ".out",
+          InteractiveChannelBaseName + ".in");
+    }
+    return AOTRunner;
+  };
+  return std::make_unique<MLInlineAdvisor>(M, MAM, RunnerFactory,
                                            GetDefaultAdvice);
 }
 
@@ -106,8 +108,9 @@ static cl::opt<bool> KeepFPICache(
         "For test - keep the ML Inline advisor's FunctionPropertiesInfo cache"),
     cl::init(false));
 
-// clang-format off
-std::vector<TensorSpec> llvm::FeatureMap{
+const std::vector<TensorSpec> &MLInlineAdvisor::getInitialFeatureMap() {
+  // clang-format off
+static std::vector<TensorSpec> FeatureMap{
 #define POPULATE_NAMES(DTYPE, SHAPE, NAME, __) TensorSpec::createSpec<DTYPE>(#NAME, SHAPE),
 // InlineCost features - these must come first
   INLINE_COST_FEATURE_ITERATOR(POPULATE_NAMES)
@@ -116,7 +119,9 @@ std::vector<TensorSpec> llvm::FeatureMap{
   INLINE_FEATURE_ITERATOR(POPULATE_NAMES)
 #undef POPULATE_NAMES
 };
-// clang-format on
+  // clang-format on
+  return FeatureMap;
+}
 
 const char *const llvm::DecisionName = "inlining_decision";
 const TensorSpec llvm::InlineDecisionSpec =
@@ -138,17 +143,17 @@ CallBase *getInlinableCS(Instruction &I) {
 
 MLInlineAdvisor::MLInlineAdvisor(
     Module &M, ModuleAnalysisManager &MAM,
-    std::unique_ptr<MLModelRunner> Runner,
+    std::function<
+        std::unique_ptr<MLModelRunner>(const std::vector<TensorSpec> &)>
+        GetModelRunner,
     std::function<bool(CallBase &)> GetDefaultAdvice)
     : InlineAdvisor(
           M, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
-      ModelRunner(std::move(Runner)), GetDefaultAdvice(GetDefaultAdvice),
+      GetDefaultAdvice(GetDefaultAdvice), FeatureMap(getInitialFeatureMap()),
       CG(MAM.getResult<LazyCallGraphAnalysis>(M)),
       UseIR2Vec(MAM.getCachedResult<IR2VecVocabAnalysis>(M) != nullptr),
       InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize),
       PSI(MAM.getResult<ProfileSummaryAnalysis>(M)) {
-  assert(ModelRunner);
-  ModelRunner->switchContext("");
   // Extract the 'call site height' feature - the position of a call site
   // relative to the farthest statically reachable SCC node. We don't mutate
   // this value while inlining happens. Empirically, this feature proved
@@ -188,7 +193,7 @@ MLInlineAdvisor::MLInlineAdvisor(
   }
   NodeCount = AllNodes.size();
 
-  if (auto IR2VecVocabResult = MAM.getCachedResult<IR2VecVocabAnalysis>(M)) {
+  if (auto *IR2VecVocabResult = MAM.getCachedResult<IR2VecVocabAnalysis>(M)) {
     if (!IR2VecVocabResult->isValid()) {
       M.getContext().emitError("IR2VecVocabAnalysis is not valid");
       return;
@@ -200,6 +205,15 @@ MLInlineAdvisor::MLInlineAdvisor(
     FeatureMap.push_back(
         TensorSpec::createSpec<float>("caller_embedding", {IR2VecDim}));
   }
+  if (InteractiveIncludeDefault)
+    FeatureMap.push_back(DefaultDecisionSpec);
+
+  ModelRunner = GetModelRunner(getFeatureMap());
+  if (!ModelRunner) {
+    M.getContext().emitError("Could not create model runner");
+    return;
+  }
+  ModelRunner->switchContext("");
 }
 
 unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const {
@@ -471,7 +485,8 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) {
   }
   // This one would have been set up to be right at the end.
   if (!InteractiveChannelBaseName.empty() && InteractiveIncludeDefault)
-    *ModelRunner->getTensor<int64_t>(FeatureMap.size()) = GetDefaultAdvice(CB);
+    *ModelRunner->getTensor<int64_t>(getFeatureMap().size() - 1) =
+        GetDefaultAdvice(CB);
   return getAdviceFromModel(CB, ORE);
 }
 
@@ -549,8 +564,8 @@ void MLInlineAdvice::reportContextForRemark(
     DiagnosticInfoOptimizationBase &OR) {
   using namespace ore;
   OR << NV("Callee", Callee->getName());
-  for (size_t I = 0; I < FeatureMap.size(); ++I)
-    OR << NV(FeatureMap[I].name(),
+  for (size_t I = 0; I < getAdvisor()->getFeatureMap().size(); ++I)
+    OR << NV(getAdvisor()->getFeatureMap()[I].name(),
              *getAdvisor()->getModelRunner().getTensor<int64_t>(I));
   OR << NV("ShouldInline", isInliningRecommended());
 }
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index e0b7f65d18a3..1df4eda2580d 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -589,6 +589,59 @@ bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
   return true;
 }
 
+std::optional<TypeSize> llvm::getBaseObjectSize(const Value *Ptr,
+                                                const DataLayout &DL,
+                                                const TargetLibraryInfo *TLI,
+                                                ObjectSizeOpts Opts) {
+  assert(Opts.EvalMode == ObjectSizeOpts::Mode::ExactSizeFromOffset &&
+         "Other modes are currently not supported");
+
+  auto Align = [&](TypeSize Size, MaybeAlign Alignment) {
+    if (Opts.RoundToAlign && Alignment && !Size.isScalable())
+      return TypeSize::getFixed(alignTo(Size.getFixedValue(), *Alignment));
+    return Size;
+  };
+
+  if (isa<UndefValue>(Ptr))
+    return TypeSize::getZero();
+
+  if (isa<ConstantPointerNull>(Ptr)) {
+    if (Opts.NullIsUnknownSize || Ptr->getType()->getPointerAddressSpace())
+      return std::nullopt;
+    return TypeSize::getZero();
+  }
+
+  if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+    if (!GV->getValueType()->isSized() || GV->hasExternalWeakLinkage() ||
+        !GV->hasInitializer() || GV->isInterposable())
+      return std::nullopt;
+    return Align(DL.getTypeAllocSize(GV->getValueType()), GV->getAlign());
+  }
+
+  if (auto *A = dyn_cast<Argument>(Ptr)) {
+    Type *MemoryTy = A->getPointeeInMemoryValueType();
+    if (!MemoryTy || !MemoryTy->isSized())
+      return std::nullopt;
+    return Align(DL.getTypeAllocSize(MemoryTy), A->getParamAlign());
+  }
+
+  if (auto *AI = dyn_cast<AllocaInst>(Ptr)) {
+    if (std::optional<TypeSize> Size = AI->getAllocationSize(DL))
+      return Align(*Size, AI->getAlign());
+    return std::nullopt;
+  }
+
+  if (auto *CB = dyn_cast<CallBase>(Ptr)) {
+    if (std::optional<APInt> Size = getAllocSize(CB, TLI)) {
+      if (std::optional<uint64_t> ZExtSize = Size->tryZExtValue())
+        return TypeSize::getFixed(*ZExtSize);
+    }
+    return std::nullopt;
+  }
+
+  return std::nullopt;
+}
+
 Value *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize,
                                  const DataLayout &DL,
                                  const TargetLibraryInfo *TLI,
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 72b643c56a99..dcc51178b975 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -12,6 +12,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include <optional>
 using namespace llvm;
@@ -150,6 +151,33 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
   return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
 }
 
+// If the mask for a memory op is a get_active_lane_mask intrinsic with
+// constant operands, we can possibly infer the size of memory written or read.
+static std::optional<FixedVectorType *>
+getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
+  using namespace llvm::PatternMatch;
+  ConstantInt *Op0, *Op1;
+  if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
+                       m_ConstantInt(Op0), m_ConstantInt(Op1))))
+    return std::nullopt;
+
+  APInt LaneMaskLo = Op0->getValue();
+  APInt LaneMaskHi = Op1->getValue();
+  if (LaneMaskHi.ule(LaneMaskLo))
+    return std::nullopt;
+
+  APInt NumElts = LaneMaskHi - LaneMaskLo;
+  if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) {
+    if (isa<ScalableVectorType>(Ty))
+      return std::nullopt;
+    // Unlike scalable vectors, fixed vector types are guaranteed to handle the
+    // KnownMinValue and can be clamped.
+    NumElts = Ty->getElementCount().getKnownMinValue();
+  }
+
+  return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue());
+}
+
 MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
                                               unsigned ArgIdx,
                                               const TargetLibraryInfo *TLI) {
@@ -213,20 +241,26 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
               cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()),
           AATags);
 
-    case Intrinsic::masked_load:
+    case Intrinsic::masked_load: {
       assert(ArgIdx == 0 && "Invalid argument index");
-      return MemoryLocation(
-          Arg,
-          LocationSize::upperBound(DL.getTypeStoreSize(II->getType())),
-          AATags);
 
-    case Intrinsic::masked_store:
+      auto *Ty = cast<VectorType>(II->getType());
+      if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
+        return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
+      return MemoryLocation(
+          Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+    }
+    case Intrinsic::masked_store: {
       assert(ArgIdx == 1 && "Invalid argument index");
+
+      auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType());
+      if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
+        return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
       return MemoryLocation(
-          Arg,
-          LocationSize::upperBound(
-              DL.getTypeStoreSize(II->getArgOperand(0)->getType())),
-          AATags);
+          Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+    }
 
     case Intrinsic::invariant_end:
       // The first argument to an invariant.end is a "descriptor" type (e.g. a
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index b3c8a7d4563b..b5ca6b13108f 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -121,6 +121,24 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) {
   return NumAllocTypes == 1;
 }
 
+void llvm::memprof::removeAnyExistingAmbiguousAttribute(CallBase *CB) {
+  if (!CB->hasFnAttr("memprof"))
+    return;
+  assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
+  CB->removeFnAttr("memprof");
+}
+
+void llvm::memprof::addAmbiguousAttribute(CallBase *CB) {
+  // We may have an existing ambiguous attribute if we are reanalyzing
+  // after inlining.
+  if (CB->hasFnAttr("memprof")) {
+    assert(CB->getFnAttr("memprof").getValueAsString() == "ambiguous");
+  } else {
+    auto A = llvm::Attribute::get(CB->getContext(), "memprof", "ambiguous");
+    CB->addFnAttr(A);
+  }
+}
+
 void CallStackTrie::addCallStack(
     AllocationType AllocType, ArrayRef<uint64_t> StackIds,
     std::vector<ContextTotalSize> ContextSizeInfo) {
@@ -466,6 +484,9 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT,
                                                 StringRef Descriptor) {
   auto AllocTypeString = getAllocTypeAttributeString(AT);
   auto A = llvm::Attribute::get(CI->getContext(), "memprof", AllocTypeString);
+  // After inlining we may be able to convert an existing ambiguous allocation
+  // to an unambiguous one.
+  removeAnyExistingAmbiguousAttribute(CI);
   CI->addFnAttr(A);
   if (MemProfReportHintedSizes) {
     std::vector<ContextTotalSize> ContextSizeInfo;
@@ -525,6 +546,7 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
     assert(MIBCallStack.size() == 1 &&
            "Should only be left with Alloc's location in stack");
     CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
+    addAmbiguousAttribute(CI);
     return true;
   }
   // If there exists corner case that CallStackTrie has one chain to leaf
diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index ecfecb03c375..bb3e679219ae 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -411,17 +411,11 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
     FixupList.push_back(MD);
   }
 
-  // Remember the index where we stopped inserting new phis above, since the
-  // fixupDefs call in the loop below may insert more, that are already minimal.
+  // Update defining access of following defs.
   unsigned NewPhiIndexEnd = InsertedPHIs.size();
-
-  while (!FixupList.empty()) {
-    unsigned StartingPHISize = InsertedPHIs.size();
-    fixupDefs(FixupList);
-    FixupList.clear();
-    // Put any new phis on the fixup list, and process them
-    FixupList.append(InsertedPHIs.begin() + StartingPHISize, InsertedPHIs.end());
-  }
+  fixupDefs(FixupList);
+  assert(NewPhiIndexEnd == InsertedPHIs.size() &&
+         "Should not insert new phis during fixupDefs()");
 
   // Optimize potentially non-minimal phis added in this method.
   unsigned NewPhiSize = NewPhiIndexEnd - NewPhiIndex;
@@ -504,11 +498,8 @@ void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<WeakVH> &Vars) {
         assert(MSSA->dominates(NewDef, FirstDef) &&
                "Should have dominated the new access");
 
-        // This may insert new phi nodes, because we are not guaranteed the
-        // block we are processing has a single pred, and depending where the
-        // store was inserted, it may require phi nodes below it.
-        cast<MemoryDef>(FirstDef)->setDefiningAccess(getPreviousDef(FirstDef));
-        return;
+        cast<MemoryDef>(FirstDef)->setDefiningAccess(NewDef);
+        continue;
       }
       // We didn't find a def, so we must continue.
       for (const auto *S : successors(FixupBlock)) {
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index f60a1e9f2270..51caffc41002 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -500,10 +500,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) {
   return S;
 }
 
-const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) {
+const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC,
+                                             SCEV::NoWrapFlags Flags) {
   const SCEV *Res = getConstant(Ty, EC.getKnownMinValue());
   if (EC.isScalable())
-    Res = getMulExpr(Res, getVScale(Ty));
+    Res = getMulExpr(Res, getVScale(Ty), Flags);
   return Res;
 }
 
@@ -3199,6 +3200,37 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                                AddRec->getNoWrapFlags(FlagsMask));
         }
       }
+
+      // Try to push the constant operand into a ZExt: C * zext (A + B) ->
+      // zext (C*A + C*B) if trunc (C) * (A + B)  does not unsigned-wrap.
+      const SCEVAddExpr *InnerAdd;
+      if (match(Ops[1], m_scev_ZExt(m_scev_Add(InnerAdd)))) {
+        const SCEV *NarrowC = getTruncateExpr(LHSC, InnerAdd->getType());
+        if (isa<SCEVConstant>(InnerAdd->getOperand(0)) &&
+            getZeroExtendExpr(NarrowC, Ops[1]->getType()) == LHSC &&
+            hasFlags(StrengthenNoWrapFlags(this, scMulExpr, {NarrowC, InnerAdd},
+                                           SCEV::FlagAnyWrap),
+                     SCEV::FlagNUW)) {
+          auto *Res = getMulExpr(NarrowC, InnerAdd, SCEV::FlagNUW, Depth + 1);
+          return getZeroExtendExpr(Res, Ops[1]->getType(), Depth + 1);
+        };
+      }
+
+      // Try to fold (C1 * D /u C2) -> C1/C2 * D, if C1 and C2 are powers-of-2,
+      // D is a multiple of C2, and C1 is a multiple of C2.
+      const SCEV *D;
+      APInt C1V = LHSC->getAPInt();
+      // (C1 * D /u C2) == -1 * -C1 * D /u C2 when C1 != INT_MIN.
+      if (C1V.isNegative() && !C1V.isMinSignedValue())
+        C1V = C1V.abs();
+      const SCEVConstant *C2;
+      if (C1V.isPowerOf2() &&
+          match(Ops[1], m_scev_UDiv(m_SCEV(D), m_SCEVConstant(C2))) &&
+          C2->getAPInt().isPowerOf2() && C1V.uge(C2->getAPInt()) &&
+          C1V.logBase2() <= getMinTrailingZeros(D)) {
+        const SCEV *NewMul = getMulExpr(getUDivExpr(getConstant(C1V), C2), D);
+        return C1V == LHSC->getAPInt() ? NewMul : getNegativeSCEV(NewMul);
+      }
     }
   }
 
@@ -15985,6 +16017,16 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
     }
 
     const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+      // Trip count expressions sometimes consist of adding 3 operands, i.e.
+      // (Const + A + B). There may be guard info for A + B, and if so, apply
+      // it.
+      // TODO: Could more generally apply guards to Add sub-expressions.
+      if (isa<SCEVConstant>(Expr->getOperand(0)) &&
+          Expr->getNumOperands() == 3) {
+        if (const SCEV *S = Map.lookup(
+                SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2))))
+          return SE.getAddExpr(Expr->getOperand(0), S);
+      }
       SmallVector<const SCEV *, 2> Operands;
       bool Changed = false;
       for (const auto *Op : Expr->operands()) {
diff --git a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
index d03930d9e2d9..bce41f9f5329 100644
--- a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
@@ -15,10 +15,14 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/Support/Casting.h"
 #include <cassert>
 #include <cstdint>
 
+#define DEBUG_TYPE "scev-division"
+
 namespace llvm {
 class Type;
 } // namespace llvm
@@ -257,3 +261,31 @@ void SCEVDivision::cannotDivide(const SCEV *Numerator) {
   Quotient = Zero;
   Remainder = Numerator;
 }
+
+void SCEVDivisionPrinterPass::runImpl(Function &F, ScalarEvolution &SE) {
+  OS << "Printing analysis 'Scalar Evolution Division' for function '"
+     << F.getName() << "':\n";
+  for (Instruction &Inst : instructions(F)) { // Walk every instruction in F.
+    BinaryOperator *Div = dyn_cast<BinaryOperator>(&Inst);
+    if (!Div || Div->getOpcode() != Instruction::SDiv)
+      continue; // Only sdiv instructions are reported.
+
+    const SCEV *Numerator = SE.getSCEV(Div->getOperand(0));
+    const SCEV *Denominator = SE.getSCEV(Div->getOperand(1));
+    const SCEV *Quotient, *Remainder; // Out-params passed to divide().
+    SCEVDivision::divide(SE, Numerator, Denominator, &Quotient, &Remainder);
+
+    OS << "Instruction: " << *Div << "\n";
+    OS.indent(2) << "Numerator: " << *Numerator << "\n";
+    OS.indent(2) << "Denominator: " << *Denominator << "\n";
+    OS.indent(2) << "Quotient: " << *Quotient << "\n";
+    OS.indent(2) << "Remainder: " << *Remainder << "\n";
+  }
+}
+
+PreservedAnalyses SCEVDivisionPrinterPass::run(Function &F,
+                                               FunctionAnalysisManager &AM) {
+  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  runImpl(F, SE); // Prints the division results for F to OS.
+  return PreservedAnalyses::all(); // Reports every analysis as preserved.
+}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4ac8f03e6dbf..899806bf3734 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1283,9 +1283,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
 }
 
 InstructionCost TargetTransformInfo::getMulAccReductionCost(
-    bool IsUnsigned, Type *ResTy, VectorType *Ty,
+    bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind);
+  return TTIImpl->getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty,
+                                         CostKind);
 }
 
 InstructionCost
@@ -1402,8 +1403,9 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
-bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
-  return TTIImpl->preferFixedOverScalableIfEqualCost();
+bool TargetTransformInfo::preferFixedOverScalableIfEqualCost(
+    bool IsEpilogue) const {
+  return TTIImpl->preferFixedOverScalableIfEqualCost(IsEpilogue);
 }
 
 bool TargetTransformInfo::preferInLoopReduction(RecurKind Kind,
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 7fe129b8456f..129823e0e98a 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -413,6 +413,18 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
         isGuaranteedNotToBeUndef(Op0, Q.AC, Q.CxtI, Q.DT, Depth + 1);
   Known = KnownBits::mul(Known, Known2, SelfMultiply);
 
+  if (SelfMultiply) {
+    unsigned SignBits = ComputeNumSignBits(Op0, DemandedElts, Q, Depth + 1);
+    unsigned TyBits = Op0->getType()->getScalarSizeInBits();
+    unsigned OutValidBits = 2 * (TyBits - SignBits + 1);
+
+    if (OutValidBits < TyBits) {
+      APInt KnownZeroMask =
+          APInt::getHighBitsSet(TyBits, TyBits - OutValidBits + 1);
+      Known.Zero |= KnownZeroMask;
+    }
+  }
+
   // Only make use of no-wrap flags if we failed to compute the sign bit
   // directly.  This matters if the multiplication always overflows, in
   // which case we prefer to follow the result of the direct computation,
@@ -727,17 +739,16 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
       // For those bits in C that are known, we can propagate them to known
       // bits in V shifted to the right by ShAmt.
       KnownBits RHSKnown = KnownBits::makeConstant(*C);
-      RHSKnown.Zero.lshrInPlace(ShAmt);
-      RHSKnown.One.lshrInPlace(ShAmt);
+      RHSKnown >>= ShAmt;
       Known = Known.unionWith(RHSKnown);
       // assume(V >> ShAmt = C)
     } else if (match(LHS, m_Shr(m_V, m_ConstantInt(ShAmt))) &&
                ShAmt < BitWidth) {
-      KnownBits RHSKnown = KnownBits::makeConstant(*C);
       // For those bits in RHS that are known, we can propagate them to known
       // bits in V shifted to the right by C.
-      Known.Zero |= RHSKnown.Zero << ShAmt;
-      Known.One |= RHSKnown.One << ShAmt;
+      KnownBits RHSKnown = KnownBits::makeConstant(*C);
+      RHSKnown <<= ShAmt;
+      Known = Known.unionWith(RHSKnown);
     }
     break;
   case ICmpInst::ICMP_NE: {
@@ -1829,18 +1840,16 @@ static void computeKnownBitsFromOperator(const Operator *I,
       case Intrinsic::abs: {
         computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1);
         bool IntMinIsPoison = match(II->getArgOperand(1), m_One());
-        Known = Known2.abs(IntMinIsPoison);
+        Known = Known.unionWith(Known2.abs(IntMinIsPoison));
         break;
       }
       case Intrinsic::bitreverse:
         computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1);
-        Known.Zero |= Known2.Zero.reverseBits();
-        Known.One |= Known2.One.reverseBits();
+        Known = Known.unionWith(Known2.reverseBits());
         break;
       case Intrinsic::bswap:
         computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1);
-        Known.Zero |= Known2.Zero.byteSwap();
-        Known.One |= Known2.One.byteSwap();
+        Known = Known.unionWith(Known2.byteSwap());
         break;
       case Intrinsic::ctlz: {
         computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1);
@@ -1890,10 +1899,9 @@ static void computeKnownBitsFromOperator(const Operator *I,
         computeKnownBits(I->getOperand(0), DemandedElts, Known2, Q, Depth + 1);
         computeKnownBits(I->getOperand(1), DemandedElts, Known3, Q, Depth + 1);
 
-        Known.Zero =
-            Known2.Zero.shl(ShiftAmt) | Known3.Zero.lshr(BitWidth - ShiftAmt);
-        Known.One =
-            Known2.One.shl(ShiftAmt) | Known3.One.lshr(BitWidth - ShiftAmt);
+        Known2 <<= ShiftAmt;
+        Known3 >>= BitWidth - ShiftAmt;
+        Known = Known2.unionWith(Known3);
         break;
       }
       case Intrinsic::uadd_sat:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 425ea311d653..091d94843698 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -166,6 +166,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -200,6 +201,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index 33eed07c4629..8737dc0fc745 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -77,9 +77,10 @@ bool MetadataVerifier::verifyScalarEntry(
     msgpack::MapDocNode &MapNode, StringRef Key, bool Required,
     msgpack::Type SKind,
     function_ref<bool(msgpack::DocNode &)> verifyValue) {
-  return verifyEntry(MapNode, Key, Required, [=](msgpack::DocNode &Node) {
-    return verifyScalar(Node, SKind, verifyValue);
-  });
+  return verifyEntry(MapNode, Key, Required,
+                     [this, SKind, verifyValue](msgpack::DocNode &Node) {
+                       return verifyScalar(Node, SKind, verifyValue);
+                     });
 }
 
 bool MetadataVerifier::verifyIntegerEntry(msgpack::MapDocNode &MapNode,
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 738e47b8b16c..a5cedadd3098 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TimeProfiler.h"
 
 #include <algorithm>
 #include <cassert>
@@ -1052,6 +1053,7 @@ void MetadataLoader::MetadataLoaderImpl::callMDTypeCallback(Metadata **Val,
 /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
 /// module level metadata.
 Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
+  llvm::TimeTraceScope timeScope("Parse metadata");
   if (!ModuleLevel && MetadataList.hasFwdRefs())
     return error("Invalid metadata: fwd refs into function blocks");
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index a3f825408d0c..a1d5b36bde64 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1495,14 +1495,11 @@ void ModuleBitcodeWriter::writeModuleInfo() {
   // compute the maximum alignment value.
   std::map<std::string, unsigned> SectionMap;
   std::map<std::string, unsigned> GCMap;
-  MaybeAlign MaxAlignment;
+  MaybeAlign MaxGVarAlignment;
   unsigned MaxGlobalType = 0;
-  const auto UpdateMaxAlignment = [&MaxAlignment](const MaybeAlign A) {
-    if (A)
-      MaxAlignment = !MaxAlignment ? *A : std::max(*MaxAlignment, *A);
-  };
   for (const GlobalVariable &GV : M.globals()) {
-    UpdateMaxAlignment(GV.getAlign());
+    if (MaybeAlign A = GV.getAlign())
+      MaxGVarAlignment = !MaxGVarAlignment ? *A : std::max(*MaxGVarAlignment, *A);
     MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType()));
     if (GV.hasSection()) {
       // Give section names unique ID's.
@@ -1515,7 +1512,6 @@ void ModuleBitcodeWriter::writeModuleInfo() {
     }
   }
   for (const Function &F : M) {
-    UpdateMaxAlignment(F.getAlign());
     if (F.hasSection()) {
       // Give section names unique ID's.
       unsigned &Entry = SectionMap[std::string(F.getSection())];
@@ -1551,10 +1547,10 @@ void ModuleBitcodeWriter::writeModuleInfo() {
                                                            //| constant
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // Initializer.
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); // Linkage.
-    if (!MaxAlignment)                                     // Alignment.
+    if (!MaxGVarAlignment)                                 // Alignment.
       Abbv->Add(BitCodeAbbrevOp(0));
     else {
-      unsigned MaxEncAlignment = getEncodedAlign(MaxAlignment);
+      unsigned MaxEncAlignment = getEncodedAlign(MaxGVarAlignment);
       Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
                                Log2_32_Ceil(MaxEncAlignment+1)));
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 23a3543e9ebe..cd14a4f57f76 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1432,7 +1432,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
   MCSection *BBAddrMapSection =
       getObjFileLowering().getBBAddrMapSection(*MF.getSection());
   assert(BBAddrMapSection && ".llvm_bb_addr_map section is not initialized.");
-  bool HasCalls = !CurrentFnCallsiteSymbols.empty();
+  bool HasCalls = !CurrentFnCallsiteEndSymbols.empty();
 
   const MCSymbol *FunctionSymbol = getFunctionBegin();
 
@@ -1497,13 +1497,13 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
       emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol);
       const MCSymbol *CurrentLabel = MBBSymbol;
       if (HasCalls) {
-        auto CallsiteSymbols = CurrentFnCallsiteSymbols.lookup(&MBB);
+        auto CallsiteEndSymbols = CurrentFnCallsiteEndSymbols.lookup(&MBB);
         OutStreamer->AddComment("number of callsites");
-        OutStreamer->emitULEB128IntValue(CallsiteSymbols.size());
-        for (const MCSymbol *CallsiteSymbol : CallsiteSymbols) {
+        OutStreamer->emitULEB128IntValue(CallsiteEndSymbols.size());
+        for (const MCSymbol *CallsiteEndSymbol : CallsiteEndSymbols) {
           // Emit the callsite offset.
-          emitLabelDifferenceAsULEB128(CallsiteSymbol, CurrentLabel);
-          CurrentLabel = CallsiteSymbol;
+          emitLabelDifferenceAsULEB128(CallsiteEndSymbol, CurrentLabel);
+          CurrentLabel = CallsiteEndSymbol;
         }
       }
       // Emit the offset to the end of the block, which can be used to compute
@@ -1941,8 +1941,6 @@ void AsmPrinter::emitFunctionBody() {
           !MI.isDebugInstr()) {
         HasAnyRealCode = true;
       }
-      if (MI.isCall() && MF->getTarget().Options.BBAddrMap)
-        OutStreamer->emitLabel(createCallsiteSymbol(MBB));
 
       // If there is a pre-instruction symbol, emit a label for it here.
       if (MCSymbol *S = MI.getPreInstrSymbol())
@@ -2064,6 +2062,9 @@ void AsmPrinter::emitFunctionBody() {
         break;
       }
 
+      if (MI.isCall() && MF->getTarget().Options.BBAddrMap)
+        OutStreamer->emitLabel(createCallsiteEndSymbol(MBB));
+
       if (TM.Options.EmitCallGraphSection && MI.isCall())
         emitIndirectCalleeLabels(FuncInfo, CallSitesInfoMap, MI);
 
@@ -2897,11 +2898,11 @@ MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) {
   return Res.first->second;
 }
 
-MCSymbol *AsmPrinter::createCallsiteSymbol(const MachineBasicBlock &MBB) {
+MCSymbol *AsmPrinter::createCallsiteEndSymbol(const MachineBasicBlock &MBB) {
   MCContext &Ctx = MF->getContext();
   MCSymbol *Sym = Ctx.createTempSymbol("BB" + Twine(MF->getFunctionNumber()) +
                                        "_" + Twine(MBB.getNumber()) + "_CS");
-  CurrentFnCallsiteSymbols[&MBB].push_back(Sym);
+  CurrentFnCallsiteEndSymbols[&MBB].push_back(Sym);
   return Sym;
 }
 
@@ -2939,7 +2940,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   CurrentFnBegin = nullptr;
   CurrentFnBeginLocal = nullptr;
   CurrentSectionBeginSym = nullptr;
-  CurrentFnCallsiteSymbols.clear();
+  CurrentFnCallsiteEndSymbols.clear();
   MBBSectionRanges.clear();
   MBBSectionExceptionSyms.clear();
   bool NeedsLocalForSize = MAI->needsLocalForSize();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index c27f10077562..2090157a1a91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3111,8 +3111,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
                             &AP](const DbgValueLocEntry &Entry,
                                  DIExpressionCursor &Cursor) -> bool {
     if (Entry.isInt()) {
-      if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
-                 BT->getEncoding() == dwarf::DW_ATE_signed_char))
+      if (BT && (BT->getEncoding() == dwarf::DW_ATE_boolean))
+        DwarfExpr.addBooleanConstant(Entry.getInt());
+      else if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
+                      BT->getEncoding() == dwarf::DW_ATE_signed_char))
         DwarfExpr.addSignedConstant(Entry.getInt());
       else
         DwarfExpr.addUnsignedConstant(Entry.getInt());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index e684054ffa3e..8a30714db2fd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -194,6 +194,15 @@ void DwarfExpression::addStackValue() {
     emitOp(dwarf::DW_OP_stack_value);
 }
 
+void DwarfExpression::addBooleanConstant(int64_t Value) {
+  assert(isImplicitLocation() || isUnknownLocation());
+  LocationKind = Implicit;
+  if (Value == 0)
+    emitOp(dwarf::DW_OP_lit0); // Value == 0: emit DW_OP_lit0.
+  else
+    emitOp(dwarf::DW_OP_lit1); // Any non-zero Value collapses to DW_OP_lit1.
+}
+
 void DwarfExpression::addSignedConstant(int64_t Value) {
   assert(isImplicitLocation() || isUnknownLocation());
   LocationKind = Implicit;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 06809ab26387..700e0ec5813e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -229,6 +229,9 @@ public:
   /// This needs to be called last to commit any pending changes.
   void finalize();
 
+  /// Emit a boolean constant: DW_OP_lit0 for zero, DW_OP_lit1 otherwise.
+  void addBooleanConstant(int64_t Value);
+
   /// Emit a signed constant.
   void addSignedConstant(int64_t Value);
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index b03fac2d22a5..d76fd0c01020 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1351,6 +1351,13 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
       ContextDIE = &getUnitDie();
       // Build the decl now to ensure it precedes the definition.
       getOrCreateSubprogramDIE(SPDecl);
+      // Check whether the DIE for SP has already been created after the call
+      // above.
+      // FIXME: Should the creation of definition subprogram DIE during
+      // the creation of declaration subprogram DIE be allowed?
+      // See https://github.com/llvm/llvm-project/pull/154636.
+      if (DIE *SPDie = getDIE(SP))
+        return SPDie;
     }
   }
 
@@ -1403,11 +1410,8 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
 
   // Add the linkage name if we have one and it isn't in the Decl.
   StringRef LinkageName = SP->getLinkageName();
-  assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
-          LinkageName == DeclLinkageName) &&
-         "decl has a linkage name and it is different");
-  if (DeclLinkageName.empty() &&
-      // Always emit it for abstract subprograms.
+  // Always emit linkage name for abstract subprograms.
+  if (DeclLinkageName != LinkageName &&
       (DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP)))
     addLinkageName(SPDie, LinkageName);
 
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 278dd6560e73..4931403ab83a 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -84,7 +84,7 @@ private:
   bool expandAtomicLoadToCmpXchg(LoadInst *LI);
   StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
   bool tryExpandAtomicStore(StoreInst *SI);
-  void expandAtomicStore(StoreInst *SI);
+  void expandAtomicStoreToXChg(StoreInst *SI);
   bool tryExpandAtomicRMW(AtomicRMWInst *AI);
   AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
   Value *
@@ -537,6 +537,9 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     LI->setAtomic(AtomicOrdering::NotAtomic);
     return true;
+  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
+    TLI->emitExpandAtomicLoad(LI);
+    return true;
   default:
     llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
   }
@@ -546,8 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
   switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
+  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
+    TLI->emitExpandAtomicStore(SI);
+    return true;
   case TargetLoweringBase::AtomicExpansionKind::Expand:
-    expandAtomicStore(SI);
+    expandAtomicStoreToXChg(SI);
     return true;
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     SI->setAtomic(AtomicOrdering::NotAtomic);
@@ -620,7 +626,7 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {
   return NewSI;
 }
 
-void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
+void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) {
   // This function is only called on atomic stores that are too large to be
   // atomic if implemented as a native store. So we replace them by an
   // atomic swap, that can be implemented for example as a ldrex/strex on ARM
@@ -741,7 +747,7 @@ bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
   }
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicRMWInst(AI);
-  case TargetLoweringBase::AtomicExpansionKind::Expand:
+  case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
     TLI->emitExpandAtomicRMW(AI);
     return true;
   default:
@@ -1454,7 +1460,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
 
   // If the cmpxchg doesn't actually need any ordering when it fails, we can
   // jump straight past that fence instruction (if it exists).
-  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
+  Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB,
+                       MDBuilder(F->getContext()).createLikelyBranchWeights());
 
   Builder.SetInsertPoint(ReleasingStoreBB);
   if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
@@ -1473,7 +1480,8 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
       StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
   BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
   Builder.CreateCondBr(StoreSuccess, SuccessBB,
-                       CI->isWeak() ? FailureBB : RetryBB);
+                       CI->isWeak() ? FailureBB : RetryBB,
+                       MDBuilder(F->getContext()).createLikelyBranchWeights());
 
   Builder.SetInsertPoint(ReleasedLoadBB);
   Value *SecondLoad;
@@ -1486,7 +1494,9 @@ bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
 
     // If the cmpxchg doesn't actually need any ordering when it fails, we can
     // jump straight past that fence instruction (if it exists).
-    Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+    Builder.CreateCondBr(
+        ShouldStore, TryStoreBB, NoStoreBB,
+        MDBuilder(F->getContext()).createLikelyBranchWeights());
     // Update PHI node in TryStoreBB.
     LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
   } else
@@ -1695,7 +1705,7 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
     return true;
   case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
     return lowerAtomicCmpXchgInst(CI);
-  case TargetLoweringBase::AtomicExpansionKind::Expand: {
+  case TargetLoweringBase::AtomicExpansionKind::CustomExpand: {
     TLI->emitExpandAtomicCmpXchg(CI);
     return true;
   }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0e40a92fd8d6..9db4c9e5e280 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2618,22 +2618,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI,
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
-  // Lower inline assembly if we can.
-  // If we found an inline asm expession, and if the target knows how to
-  // lower it to normal LLVM code, do so now.
-  if (CI->isInlineAsm()) {
-    if (TLI->ExpandInlineAsm(CI)) {
-      // Avoid invalidating the iterator.
-      CurInstIterator = BB->begin();
-      // Avoid processing instructions out of order, which could cause
-      // reuse before a value is defined.
-      SunkAddrs.clear();
-      return true;
-    }
-    // Sink address computing for memory operands into the block.
-    if (optimizeInlineAsmInst(CI))
-      return true;
-  }
+  // Sink address computing for memory operands into the block.
+  if (CI->isInlineAsm() && optimizeInlineAsmInst(CI))
+    return true;
 
   // Align the pointer arguments to this call if the target thinks it's a good
   // idea
diff --git a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
index 442ec3840930..5d7e2b59c204 100644
--- a/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
+++ b/llvm/lib/CodeGen/CodeGenTargetMachineImpl.cpp
@@ -45,7 +45,7 @@ static cl::opt<bool> EnableNoTrapAfterNoreturn(
              "after noreturn calls, even if --trap-unreachable is set."));
 
 void CodeGenTargetMachineImpl::initAsmInfo() {
-  MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str()));
+  MRI.reset(TheTarget.createMCRegInfo(getTargetTriple()));
   assert(MRI && "Unable to create reg info");
   MII.reset(TheTarget.createMCInstrInfo());
   assert(MII && "Unable to create instruction info");
@@ -53,12 +53,12 @@ void CodeGenTargetMachineImpl::initAsmInfo() {
   // to some backends having subtarget feature dependent module level
   // code generation. This is similar to the hack in the AsmPrinter for
   // module level assembly etc.
-  STI.reset(TheTarget.createMCSubtargetInfo(
-      getTargetTriple().str(), getTargetCPU(), getTargetFeatureString()));
+  STI.reset(TheTarget.createMCSubtargetInfo(getTargetTriple(), getTargetCPU(),
+                                            getTargetFeatureString()));
   assert(STI && "Unable to create subtarget info");
 
-  MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(
-      *MRI, getTargetTriple().str(), Options.MCOptions);
+  MCAsmInfo *TmpAsmInfo =
+      TheTarget.createMCAsmInfo(*MRI, getTargetTriple(), Options.MCOptions);
   // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
   // and if the old one gets included then MCAsmInfo will be NULL and
   // we'll crash later.
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 810dc29d728d..0522698adf18 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -68,7 +68,6 @@ CGOPT(bool, EnableUnsafeFPMath)
 CGOPT(bool, EnableNoInfsFPMath)
 CGOPT(bool, EnableNoNaNsFPMath)
 CGOPT(bool, EnableNoSignedZerosFPMath)
-CGOPT(bool, EnableApproxFuncFPMath)
 CGOPT(bool, EnableNoTrappingFPMath)
 CGOPT(bool, EnableAIXExtendedAltivecABI)
 CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
@@ -245,12 +244,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
       cl::init(false));
   CGBINDOPT(EnableNoSignedZerosFPMath);
 
-  static cl::opt<bool> EnableApproxFuncFPMath(
-      "enable-approx-func-fp-math",
-      cl::desc("Enable FP math optimizations that assume approx func"),
-      cl::init(false));
-  CGBINDOPT(EnableApproxFuncFPMath);
-
   static cl::opt<bool> EnableNoTrappingFPMath(
       "enable-no-trapping-fp-math",
       cl::desc("Enable setting the FP exceptions build "
@@ -563,7 +556,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   Options.NoInfsFPMath = getEnableNoInfsFPMath();
   Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
   Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
-  Options.ApproxFuncFPMath = getEnableApproxFuncFPMath();
   Options.NoTrappingFPMath = getEnableNoTrappingFPMath();
 
   DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
@@ -718,7 +710,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
   HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
   HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
   HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
-  HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math");
 
   if (DenormalFPMathView->getNumOccurrences() > 0 &&
       !F.hasFnAttribute("denormal-fp-math")) {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index de95e0aaf2cb..7d355e6e365d 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -60,6 +60,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/ADT/AllocatorList.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -263,6 +264,7 @@ public:
   };
 
   using Addend = std::pair<Value *, bool>;
+  using AddendList = BumpPtrList<Addend>;
   using CompositeNode = ComplexDeinterleavingCompositeNode::CompositeNode;
 
   // Helper struct for holding info about potential partial multiplication
@@ -291,7 +293,7 @@ private:
   SmallPtrSet<Instruction *, 16> FinalInstructions;
 
   /// Root instructions are instructions from which complex computation starts
-  std::map<Instruction *, CompositeNode *> RootToNode;
+  DenseMap<Instruction *, CompositeNode *> RootToNode;
 
   /// Topologically sorted root instructions
   SmallVector<Instruction *, 1> OrderedRoots;
@@ -339,7 +341,7 @@ private:
   /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then
   /// used in the ComplexDeinterleavingOperation::ReductionOperation node
   /// replacement process.
-  std::map<PHINode *, PHINode *> OldToNewPHI;
+  DenseMap<PHINode *, PHINode *> OldToNewPHI;
 
   CompositeNode *prepareCompositeNode(ComplexDeinterleavingOperation Operation,
                                       Value *R, Value *I) {
@@ -417,28 +419,28 @@ private:
   /// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
   /// Return nullptr if it is not possible to construct a complex number.
   /// \p Flags are needed to generate symmetric Add and Sub operations.
-  CompositeNode *identifyAdditions(std::list<Addend> &RealAddends,
-                                   std::list<Addend> &ImagAddends,
+  CompositeNode *identifyAdditions(AddendList &RealAddends,
+                                   AddendList &ImagAddends,
                                    std::optional<FastMathFlags> Flags,
                                    CompositeNode *Accumulator);
 
   /// Extract one addend that have both real and imaginary parts positive.
-  CompositeNode *extractPositiveAddend(std::list<Addend> &RealAddends,
-                                       std::list<Addend> &ImagAddends);
+  CompositeNode *extractPositiveAddend(AddendList &RealAddends,
+                                       AddendList &ImagAddends);
 
   /// Determine if sum of multiplications of complex numbers can be formed from
   /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result
   /// to it. Return nullptr if it is not possible to construct a complex number.
-  CompositeNode *identifyMultiplications(std::vector<Product> &RealMuls,
-                                         std::vector<Product> &ImagMuls,
+  CompositeNode *identifyMultiplications(SmallVectorImpl<Product> &RealMuls,
+                                         SmallVectorImpl<Product> &ImagMuls,
                                          CompositeNode *Accumulator);
 
   /// Go through pairs of multiplication (one Real and one Imag) and find all
   /// possible candidates for partial multiplication and put them into \p
   /// Candidates. Returns true if all Product has pair with common operand
-  bool collectPartialMuls(const std::vector<Product> &RealMuls,
-                          const std::vector<Product> &ImagMuls,
-                          std::vector<PartialMulCandidate> &Candidates);
+  bool collectPartialMuls(ArrayRef<Product> RealMuls,
+                          ArrayRef<Product> ImagMuls,
+                          SmallVectorImpl<PartialMulCandidate> &Candidates);
 
   /// If the code is compiled with -Ofast or expressions have `reassoc` flag,
   /// the order of complex computation operations may be significantly altered,
@@ -1255,8 +1257,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
   // Collect multiplications and addend instructions from the given instruction
   // while traversing it operands. Additionally, verify that all instructions
   // have the same fast math flags.
-  auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls,
-                          std::list<Addend> &Addends) -> bool {
+  auto Collect = [&Flags](Instruction *Insn, SmallVectorImpl<Product> &Muls,
+                          AddendList &Addends) -> bool {
     SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}};
     SmallPtrSet<Value *, 8> Visited;
     while (!Worklist.empty()) {
@@ -1336,8 +1338,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
     return true;
   };
 
-  std::vector<Product> RealMuls, ImagMuls;
-  std::list<Addend> RealAddends, ImagAddends;
+  SmallVector<Product> RealMuls, ImagMuls;
+  AddendList RealAddends, ImagAddends;
   if (!Collect(Real, RealMuls, RealAddends) ||
       !Collect(Imag, ImagMuls, ImagAddends))
     return nullptr;
@@ -1371,8 +1373,8 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
 }
 
 bool ComplexDeinterleavingGraph::collectPartialMuls(
-    const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls,
-    std::vector<PartialMulCandidate> &PartialMulCandidates) {
+    ArrayRef<Product> RealMuls, ArrayRef<Product> ImagMuls,
+    SmallVectorImpl<PartialMulCandidate> &PartialMulCandidates) {
   // Helper function to extract a common operand from two products
   auto FindCommonInstruction = [](const Product &Real,
                                   const Product &Imag) -> Value * {
@@ -1423,18 +1425,18 @@ bool ComplexDeinterleavingGraph::collectPartialMuls(
 
 ComplexDeinterleavingGraph::CompositeNode *
 ComplexDeinterleavingGraph::identifyMultiplications(
-    std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls,
+    SmallVectorImpl<Product> &RealMuls, SmallVectorImpl<Product> &ImagMuls,
     CompositeNode *Accumulator = nullptr) {
   if (RealMuls.size() != ImagMuls.size())
     return nullptr;
 
-  std::vector<PartialMulCandidate> Info;
+  SmallVector<PartialMulCandidate> Info;
   if (!collectPartialMuls(RealMuls, ImagMuls, Info))
     return nullptr;
 
   // Map to store common instruction to node pointers
-  std::map<Value *, CompositeNode *> CommonToNode;
-  std::vector<bool> Processed(Info.size(), false);
+  DenseMap<Value *, CompositeNode *> CommonToNode;
+  SmallVector<bool> Processed(Info.size(), false);
   for (unsigned I = 0; I < Info.size(); ++I) {
     if (Processed[I])
       continue;
@@ -1463,8 +1465,8 @@ ComplexDeinterleavingGraph::identifyMultiplications(
     }
   }
 
-  std::vector<bool> ProcessedReal(RealMuls.size(), false);
-  std::vector<bool> ProcessedImag(ImagMuls.size(), false);
+  SmallVector<bool> ProcessedReal(RealMuls.size(), false);
+  SmallVector<bool> ProcessedImag(ImagMuls.size(), false);
   CompositeNode *Result = Accumulator;
   for (auto &PMI : Info) {
     if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx])
@@ -1580,7 +1582,7 @@ ComplexDeinterleavingGraph::identifyMultiplications(
 
 ComplexDeinterleavingGraph::CompositeNode *
 ComplexDeinterleavingGraph::identifyAdditions(
-    std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends,
+    AddendList &RealAddends, AddendList &ImagAddends,
     std::optional<FastMathFlags> Flags, CompositeNode *Accumulator = nullptr) {
   if (RealAddends.size() != ImagAddends.size())
     return nullptr;
@@ -1671,8 +1673,8 @@ ComplexDeinterleavingGraph::identifyAdditions(
 }
 
 ComplexDeinterleavingGraph::CompositeNode *
-ComplexDeinterleavingGraph::extractPositiveAddend(
-    std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) {
+ComplexDeinterleavingGraph::extractPositiveAddend(AddendList &RealAddends,
+                                                  AddendList &ImagAddends) {
   for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) {
     for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
       auto [R, IsPositiveR] = *ItR;
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 1c1047c1ce18..9cc6c6a706c5 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -16,18 +16,29 @@
 
 #include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/SimplifyQuery.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <optional>
+
+#define DEBUG_TYPE "expand-fp"
 
 using namespace llvm;
 
@@ -37,6 +48,359 @@ static cl::opt<unsigned>
                         cl::desc("fp convert instructions on integers with "
                                  "more than <N> bits are expanded."));
 
+namespace {
+/// This class implements a precise expansion of the frem instruction.
+/// The generated code is based on the fmod implementation in the AMD device
+/// libs.
+class FRemExpander {
+  /// The IRBuilder to use for the expansion.
+  IRBuilder<> &B;
+
+  /// Floating point type of the return value and the arguments of the FRem
+  /// instructions that should be expanded.
+  Type *FremTy;
+
+  /// Floating point type to use for the computation.  This may be
+  /// wider than the \p FremTy.
+  Type *ComputeFpTy;
+
+  /// Integer type used to hold the exponents returned by frexp.
+  Type *ExTy;
+
+  /// How many bits of the quotient to compute per iteration of the
+  /// algorithm, stored as a value of type \p ExTy.
+  Value *Bits;
+
+  /// Constant 1 of type \p ExTy.
+  Value *One;
+
+public:
+  static bool canExpandType(Type *Ty) {
+    // TODO The expansion should work for other floating point types
+    // as well, but this would require additional testing.
+    return Ty->isIEEELikeFPTy() && !Ty->isBFloatTy() && !Ty->isFP128Ty();
+  }
+
+  static FRemExpander create(IRBuilder<> &B, Type *Ty) {
+    assert(canExpandType(Ty));
+
+    // The type to use for the computation of the remainder. This may be
+    // wider than the input/result type which affects the ...
+    Type *ComputeTy = Ty;
+    // ... maximum number of iterations of the remainder computation loop
+    // to use. This value is for the case in which the computation
+    // uses the same input/result type.
+    unsigned MaxIter = 2;
+
+    if (Ty->isHalfTy()) {
+      // Use the wider type and fewer iterations.
+      ComputeTy = B.getFloatTy();
+      MaxIter = 1;
+    }
+
+    unsigned Precision =
+        llvm::APFloat::semanticsPrecision(Ty->getFltSemantics());
+    return FRemExpander{B, Ty, Precision / MaxIter, ComputeTy};
+  }
+
+  /// Build the FRem expansion for the numerator \p X and the
+  /// denominator \p Y.  The type of X and Y must match \p FremTy. The
+  /// code will be generated at the insertion point of \p B and the
+  /// insertion point will be reset at exit.
+  Value *buildFRem(Value *X, Value *Y, std::optional<SimplifyQuery> &SQ) const;
+
+  /// Build an approximate FRem expansion for the numerator \p X and
+  /// the denominator \p Y at the insertion point of builder \p B.
+  /// The type of X and Y must match \p FremTy.
+  Value *buildApproxFRem(Value *X, Value *Y) const;
+
+private:
+  FRemExpander(IRBuilder<> &B, Type *FremTy, unsigned Bits, Type *ComputeFpTy)
+      : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), ExTy(B.getInt32Ty()),
+        Bits(ConstantInt::get(ExTy, Bits)), One(ConstantInt::get(ExTy, 1)) {};
+
+  Value *createRcp(Value *V, const Twine &Name) const {
+    // Leave it to later optimizations to turn this into an rcp
+    // instruction if available.
+    return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
+  }
+
+  // Helper function to build the UPDATE_AX code which is common to the
+  // loop body and the "final iteration".
+  Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const {
+    // Build:
+    //   float q = rint(ax * ayinv);
+    //   ax = fma(-q, ay, ax);
+    //   int clt = ax < 0.0f;
+    //   float axp = ax + ay;
+    //   ax = clt ? axp : ax;
+    Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
+                                      {}, "q");
+    Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax");
+    Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
+                              ConstantFP::getZero(ComputeFpTy), "clt");
+    Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
+    return B.CreateSelect(Clt, Axp, AxUpdate, "ax");
+  }
+
+  /// Build code to extract the exponent and mantissa of \p Src.
+  /// Return the pair (mantissa scaled by 2^\p NewExp, exponent minus one);
+  /// the exponent part is for use as a loop bound.
+  std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp,
+                                               const Twine &ExName,
+                                               const Twine &PowName) const {
+    // Build:
+    //   ExName = frexp_exp(Src) - 1;
+    //   PowName = fldexp(frexp_mant(Src), NewExp);
+    Type *Ty = Src->getType();
+    Type *ExTy = B.getInt32Ty();
+    Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
+    Value *Mant = B.CreateExtractValue(Frexp, {0});
+    Value *Exp = B.CreateExtractValue(Frexp, {1});
+
+    Exp = B.CreateSub(Exp, One, ExName);
+    Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName);
+
+    return {Pow, Exp};
+  }
+
+  /// Build the main computation of the remainder for the case in which
+  /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the
+  /// denominator. Add the incoming edge from the computation result
+  /// to \p RetPhi.
+  void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X,
+                                 PHINode *RetPhi, FastMathFlags FMF) const {
+    IRBuilder<>::FastMathFlagGuard Guard(B);
+    B.setFastMathFlags(FMF);
+
+    // Build:
+    // ex = frexp_exp(ax) - 1;
+    // ax = fldexp(frexp_mant(ax), bits);
+    // ey = frexp_exp(ay) - 1;
+    // ay = fldexp(frexp_mant(ay), 1);
+    auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
+    auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
+
+    // Build:
+    //   int nb = ex - ey;
+    //   float ayinv = 1.0/ay;
+    Value *Nb = B.CreateSub(Ex, Ey, "nb");
+    Value *Ayinv = createRcp(Ay, "ayinv");
+
+    // Build: while (nb > bits)
+    BasicBlock *PreheaderBB = B.GetInsertBlock();
+    Function *Fun = PreheaderBB->getParent();
+    auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun);
+    auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun);
+
+    B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB);
+
+    // Build loop body:
+    //   UPDATE_AX
+    //   ax = fldexp(ax, bits);
+    //   nb -= bits;
+    // One iteration of the loop is factored out.  The code shared by
+    // the loop and this "iteration" is denoted by UPDATE_AX.
+    B.SetInsertPoint(LoopBB);
+    PHINode *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv");
+    NbIv->addIncoming(Nb, PreheaderBB);
+
+    auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi");
+    AxPhi->addIncoming(Ax, PreheaderBB);
+
+    Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
+    AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update");
+    AxPhi->addIncoming(AxPhiUpdate, LoopBB);
+    NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
+
+    B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB);
+
+    // Build final iteration
+    //   ax = fldexp(ax, nb - bits + 1);
+    //   UPDATE_AX
+    B.SetInsertPoint(ExitBB);
+
+    auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi");
+    AxPhiExit->addIncoming(Ax, PreheaderBB);
+    AxPhiExit->addIncoming(AxPhi, LoopBB);
+    auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi");
+    NbExitPhi->addIncoming(NbIv, LoopBB);
+    NbExitPhi->addIncoming(Nb, PreheaderBB);
+
+    Value *AxFinal = B.CreateLdexp(
+        AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax");
+    AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
+
+    // Build:
+    //    ax = fldexp(ax, ey);
+    //    ret = copysign(ax,x);
+    AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax");
+    if (ComputeFpTy != FremTy)
+      AxFinal = B.CreateFPTrunc(AxFinal, FremTy);
+    Value *Ret = B.CreateCopySign(AxFinal, X);
+
+    RetPhi->addIncoming(Ret, ExitBB);
+  }
+
+  /// Build the else-branch of the conditional in the FRem
+  /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay
+  /// = |Y|, and X is the numerator and Y the denominator. Add the
+  /// incoming edge from the result to \p RetPhi.
+  void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const {
+    // Build:
+    // ret = ax == ay ? copysign(0.0f, x) : x;
+    Value *ZeroWithXSign = B.CreateCopySign(ConstantFP::getZero(FremTy), X);
+    Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X);
+
+    RetPhi->addIncoming(Ret, B.GetInsertBlock());
+  }
+
+  /// Return a value that is NaN if one of the corner cases concerning
+  /// the inputs \p X and \p Y is detected, and \p Ret otherwise.
+  Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y,
+                                std::optional<SimplifyQuery> &SQ,
+                                bool NoInfs) const {
+    // Build:
+    //   ret = (y == 0.0f || isnan(y)) ? QNAN : ret;
+    //   ret = isfinite(x) ? ret : QNAN;
+    Value *Nan = ConstantFP::getQNaN(FremTy);
+    Ret = B.CreateSelect(B.CreateFCmpUEQ(Y, ConstantFP::getZero(FremTy)), Nan,
+                         Ret);
+    Value *XFinite =
+        NoInfs || (SQ && isKnownNeverInfinity(X, *SQ))
+            ? B.getTrue()
+            : B.CreateFCmpULT(B.CreateUnaryIntrinsic(Intrinsic::fabs, X),
+                              ConstantFP::getInfinity(FremTy));
+    Ret = B.CreateSelect(XFinite, Ret, Nan);
+
+    return Ret;
+  }
+};
+
+Value *FRemExpander::buildApproxFRem(Value *X, Value *Y) const {
+  IRBuilder<>::FastMathFlagGuard Guard(B);
+  // Propagating the approximate functions flag to the
+  // division leads to an unacceptable drop in precision
+  // on AMDGPU.
+  // TODO Find out if any flags might be worth propagating.
+  B.clearFastMathFlags();
+
+  Value *Quot = B.CreateFDiv(X, Y);
+  Value *Trunc = B.CreateUnaryIntrinsic(Intrinsic::trunc, Quot, {});
+  Value *Neg = B.CreateFNeg(Trunc);
+
+  return B.CreateFMA(Neg, Y, X);
+}
+
+Value *FRemExpander::buildFRem(Value *X, Value *Y,
+                               std::optional<SimplifyQuery> &SQ) const {
+  assert(X->getType() == FremTy && Y->getType() == FremTy);
+
+  FastMathFlags FMF = B.getFastMathFlags();
+
+  // This function generates the following code structure:
+  //   if (abs(x) > abs(y))
+  //   { ret = compute remainder }
+  //   else
+  //   { ret = x or 0 with sign of x }
+  //   Adjust ret to NaN for the NaN/inf/zero input corner cases
+  //   return ret
+  Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax");
+  Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay");
+  if (ComputeFpTy != X->getType()) {
+    Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax");
+    Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay");
+  }
+  Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay);
+
+  PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
+  Value *Ret = RetPhi;
+
+  // We would return NaN in all corner cases handled here.
+  // Hence, if NaNs are excluded, keep the result as it is.
+  if (!FMF.noNaNs())
+    Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs());
+
+  Function *Fun = B.GetInsertBlock()->getParent();
+  auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
+  auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun);
+  SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB);
+
+  auto SavedInsertPt = B.GetInsertPoint();
+
+  // Build remainder computation for "then" branch
+  //
+  // The ordered comparison ensures that ax and ay are not NaNs
+  // in the then-branch. Furthermore, y cannot be an infinity and the
+  // check at the end of the function ensures that the result will not
+  // be used if x is an infinity.
+  FastMathFlags ComputeFMF = FMF;
+  ComputeFMF.setNoInfs();
+  ComputeFMF.setNoNaNs();
+
+  B.SetInsertPoint(ThenBB);
+  buildRemainderComputation(Ax, Ay, X, RetPhi, FMF);
+  B.CreateBr(RetPhi->getParent());
+
+  // Build "else"-branch
+  B.SetInsertPoint(ElseBB);
+  buildElseBranch(Ax, Ay, X, RetPhi);
+  B.CreateBr(RetPhi->getParent());
+
+  B.SetInsertPoint(SavedInsertPt);
+
+  return Ret;
+}
+} // namespace
+
+static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
+  LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
+
+  Type *ReturnTy = I.getType();
+  assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+
+  FastMathFlags FMF = I.getFastMathFlags();
+  // TODO Make use of those flags for optimization?
+  FMF.setAllowReciprocal(false);
+  FMF.setAllowContract(false);
+
+  IRBuilder<> B(&I);
+  B.setFastMathFlags(FMF);
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *ElemTy = ReturnTy->getScalarType();
+  const FRemExpander Expander = FRemExpander::create(B, ElemTy);
+
+  Value *Ret;
+  if (ReturnTy->isFloatingPointTy())
+    Ret = FMF.approxFunc()
+              ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+              : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
+  else {
+    auto *VecTy = cast<FixedVectorType>(ReturnTy);
+
+    // This could use SplitBlockAndInsertForEachLane but the interface
+    // is a bit awkward for a constant number of elements and it will
+    // boil down to the same code.
+    // TODO Expand the FRem instruction only once and reuse the code.
+    Value *Nums = I.getOperand(0);
+    Value *Denums = I.getOperand(1);
+    Ret = PoisonValue::get(I.getType());
+    for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+      Value *Num = B.CreateExtractElement(Nums, I);
+      Value *Denum = B.CreateExtractElement(Denums, I);
+      Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
+                                    : Expander.buildFRem(Num, Denum, SQ);
+      Ret = B.CreateInsertElement(Ret, Rem, I);
+    }
+  }
+
+  I.replaceAllUsesWith(Ret);
+  Ret->takeName(&I);
+  I.eraseFromParent();
+
+  return true;
+}
 // clang-format off: preserve formatting of the following example
 
 /// Generate code to convert a fp number to integer, replacing FPToS(U)I with
@@ -64,8 +428,8 @@ static cl::opt<unsigned>
 ///   br i1 %cmp6.not, label %if.end12, label %if.then8
 ///
 /// if.then8:                                         ; preds = %if.end
-///   %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808
-///   br label %cleanup
+///   %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64
+///   -9223372036854775808 br label %cleanup
 ///
 /// if.end12:                                         ; preds = %if.end
 ///   %cmp13 = icmp ult i64 %shr, 150
@@ -83,9 +447,10 @@ static cl::opt<unsigned>
 ///   %mul19 = mul nsw i64 %shl, %conv
 ///   br label %cleanup
 ///
-/// cleanup:                                          ; preds = %entry, %if.else, %if.then15, %if.then8
-///   %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ]
-///   ret i64 %retval.0
+/// cleanup:                                          ; preds = %entry,
+/// %if.else, %if.then15, %if.then8
+///   %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [
+///   %mul19, %if.else ], [ 0, %entry ] ret i64 %retval.0
 /// }
 ///
 /// Replace fp to integer with generated code.
@@ -272,13 +637,11 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %or = or i64 %shr6, %conv11
 ///   br label %sw.epilog
 ///
-/// sw.epilog:                                        ; preds = %sw.default, %if.then4, %sw.bb
-///   %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ]
-///   %1 = lshr i64 %a.addr.0, 2
-///   %2 = and i64 %1, 1
-///   %or16 = or i64 %2, %a.addr.0
-///   %inc = add nsw i64 %or16, 1
-///   %3 = and i64 %inc, 67108864
+/// sw.epilog:                                        ; preds = %sw.default,
+/// %if.then4, %sw.bb
+///   %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl,
+///   %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2,
+///   %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864
 ///   %tobool.not = icmp eq i64 %3, 0
 ///   %spec.select.v = select i1 %tobool.not, i64 2, i64 3
 ///   %spec.select = ashr i64 %inc, %spec.select.v
@@ -291,7 +654,8 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %shl25 = shl i64 %sub, %sh_prom24
 ///   br label %if.end26
 ///
-/// if.end26:                                         ; preds = %sw.epilog, %if.else
+/// if.end26:                                         ; preds = %sw.epilog,
+/// %if.else
 ///   %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ]
 ///   %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ]
 ///   %conv27 = trunc i64 %shr to i32
@@ -305,7 +669,8 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %4 = bitcast i32 %or33 to float
 ///   br label %return
 ///
-/// return:                                           ; preds = %entry, %if.end26
+/// return:                                           ; preds = %entry,
+/// %if.end26
 ///   %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ]
 ///   ret float %retval.0
 /// }
@@ -594,7 +959,38 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
   I->eraseFromParent();
 }
 
-static bool runImpl(Function &F, const TargetLowering &TLI) {
+// This covers all floating point types; more than we need here.
+// TODO Move somewhere else for general use?
+/// Return the Libcall for a frem instruction of
+/// type \p Ty.
+static RTLIB::Libcall fremToLibcall(Type *Ty) {
+  assert(Ty->isFloatingPointTy());
+  if (Ty->isFloatTy() || Ty->is16bitFPTy())
+    return RTLIB::REM_F32;
+  if (Ty->isDoubleTy())
+    return RTLIB::REM_F64;
+  if (Ty->isFP128Ty())
+    return RTLIB::REM_F128;
+  if (Ty->isX86_FP80Ty())
+    return RTLIB::REM_F80;
+  if (Ty->isPPC_FP128Ty())
+    return RTLIB::REM_PPCF128;
+
+  llvm_unreachable("Unknown floating point type");
+}
+
+/* Return true if, according to \p TLI, the target either directly
+   supports the frem instruction for the \p Ty, has a custom lowering,
+   or uses a libcall. */
+static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
+  if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty)))
+    return true;
+
+  return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
+}
+
+static bool runImpl(Function &F, const TargetLowering &TLI,
+                    AssumptionCache *AC) {
   SmallVector<Instruction *, 4> Replace;
   SmallVector<Instruction *, 4> ReplaceVector;
   bool Modified = false;
@@ -609,6 +1005,21 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
   for (auto &I : instructions(F)) {
     switch (I.getOpcode()) {
+    case Instruction::FRem: {
+      Type *Ty = I.getType();
+      // TODO: This pass doesn't handle scalable vectors.
+      if (Ty->isScalableTy())
+        continue;
+
+      if (targetSupportsFrem(TLI, Ty) ||
+          !FRemExpander::canExpandType(Ty->getScalarType()))
+        continue;
+
+      Replace.push_back(&I);
+      Modified = true;
+
+      break;
+    }
     case Instruction::FPToUI:
     case Instruction::FPToSI: {
       // TODO: This pass doesn't handle scalable vectors.
@@ -659,8 +1070,20 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
   while (!Replace.empty()) {
     Instruction *I = Replace.pop_back_val();
-    if (I->getOpcode() == Instruction::FPToUI ||
-        I->getOpcode() == Instruction::FPToSI) {
+    if (I->getOpcode() == Instruction::FRem) {
+      auto SQ = [&]() -> std::optional<SimplifyQuery> {
+        if (AC) {
+          auto Res = std::make_optional<SimplifyQuery>(
+              I->getModule()->getDataLayout(), I);
+          Res->AC = AC;
+          return Res;
+        }
+        return {};
+      }();
+
+      expandFRem(cast<BinaryOperator>(*I), SQ);
+    } else if (I->getOpcode() == Instruction::FPToUI ||
+               I->getOpcode() == Instruction::FPToSI) {
       expandFPToI(I);
     } else {
       expandIToFP(I);
@@ -672,31 +1095,58 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
 namespace {
 class ExpandFpLegacyPass : public FunctionPass {
+  CodeGenOptLevel OptLevel;
+
 public:
   static char ID;
 
-  ExpandFpLegacyPass() : FunctionPass(ID) {
+  ExpandFpLegacyPass(CodeGenOptLevel OptLevel)
+      : FunctionPass(ID), OptLevel(OptLevel) {
     initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
+  ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {};
+
   bool runOnFunction(Function &F) override {
     auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
-    return runImpl(F, *TLI);
+    AssumptionCache *AC = nullptr;
+
+    if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone())
+      AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    return runImpl(F, *TLI, AC);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetPassConfig>();
+    if (OptLevel != CodeGenOptLevel::None)
+      AU.addRequired<AssumptionCacheTracker>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
 };
 } // namespace
 
+ExpandFpPass::ExpandFpPass(const TargetMachine *TM, CodeGenOptLevel OptLevel)
+    : TM(TM), OptLevel(OptLevel) {}
+
+void ExpandFpPass::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  static_cast<PassInfoMixin<ExpandFpPass> *>(this)->printPipeline(
+      OS, MapClassName2PassName);
+  OS << '<';
+  OS << "O" << (int)OptLevel;
+  OS << '>';
+}
+
 PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) {
   const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F);
-  return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none()
-                                               : PreservedAnalyses::all();
+  auto &TLI = *STI->getTargetLowering();
+  AssumptionCache *AC = nullptr;
+  if (OptLevel != CodeGenOptLevel::None)
+    AC = &FAM.getResult<AssumptionAnalysis>(F);
+  return runImpl(F, TLI, AC) ? PreservedAnalyses::none()
+                             : PreservedAnalyses::all();
 }
 
 char ExpandFpLegacyPass::ID = 0;
@@ -704,4 +1154,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
                       "Expand certain fp instructions", false, false)
 INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
 
-FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); }
+FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) {
+  return new ExpandFpLegacyPass(OptLevel);
+}
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 753c65600770..03abc042e556 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -150,9 +150,8 @@ struct CachingVPExpander {
                           ElementCount ElemCount);
 
   /// If needed, folds the EVL in the mask operand and discards the EVL
-  /// parameter. Returns a pair of the value of the intrinsic after the change
-  /// (if any) and whether the mask was actually folded.
-  std::pair<Value *, bool> foldEVLIntoMask(VPIntrinsic &VPI);
+  /// parameter. Returns true if the mask was actually folded.
+  bool foldEVLIntoMask(VPIntrinsic &VPI);
 
   /// "Remove" the %evl parameter of \p PI by setting it to the static vector
   /// length of the operation. Returns true if the %evl (if any) was effectively
@@ -160,34 +159,31 @@ struct CachingVPExpander {
   bool discardEVLParameter(VPIntrinsic &PI);
 
   /// Lower this VP binary operator to a unpredicated binary operator.
-  Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
-                                           VPIntrinsic &PI);
+  bool expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI);
 
   /// Lower this VP int call to a unpredicated int call.
-  Value *expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI);
+  bool expandPredicationToIntCall(IRBuilder<> &Builder, VPIntrinsic &PI);
 
   /// Lower this VP fp call to a unpredicated fp call.
-  Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
-                                   unsigned UnpredicatedIntrinsicID);
+  bool expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
+                                 unsigned UnpredicatedIntrinsicID);
 
   /// Lower this VP reduction to a call to an unpredicated reduction intrinsic.
-  Value *expandPredicationInReduction(IRBuilder<> &Builder,
-                                      VPReductionIntrinsic &PI);
+  bool expandPredicationInReduction(IRBuilder<> &Builder,
+                                    VPReductionIntrinsic &PI);
 
   /// Lower this VP cast operation to a non-VP intrinsic.
-  Value *expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
-                                          VPIntrinsic &VPI);
+  bool expandPredicationToCastIntrinsic(IRBuilder<> &Builder, VPIntrinsic &VPI);
 
   /// Lower this VP memory operation to a non-VP intrinsic.
-  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
-                                            VPIntrinsic &VPI);
+  bool expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                          VPIntrinsic &VPI);
 
   /// Lower this VP comparison to a call to an unpredicated comparison.
-  Value *expandPredicationInComparison(IRBuilder<> &Builder,
-                                       VPCmpIntrinsic &PI);
+  bool expandPredicationInComparison(IRBuilder<> &Builder, VPCmpIntrinsic &PI);
 
   /// Query TTI and expand the vector predication in \p P accordingly.
-  Value *expandPredication(VPIntrinsic &PI);
+  bool expandPredication(VPIntrinsic &PI);
 
   /// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This
   /// overrides TTI with the cl::opts listed at the top of this file.
@@ -227,9 +223,8 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
   return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
 }
 
-Value *
-CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
-                                                     VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
+                                                          VPIntrinsic &VPI) {
   assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
 
@@ -261,14 +256,14 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
   Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName());
 
   replaceOperation(*NewBinOp, VPI);
-  return NewBinOp;
+  return true;
 }
 
-Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
-                                                     VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
+                                                   VPIntrinsic &VPI) {
   std::optional<unsigned> FID = VPI.getFunctionalIntrinsicID();
   if (!FID)
-    return nullptr;
+    return false;
   SmallVector<Value *, 2> Argument;
   for (unsigned i = 0; i < VPI.getNumOperands() - 3; i++) {
     Argument.push_back(VPI.getOperand(i));
@@ -276,10 +271,10 @@ Value *CachingVPExpander::expandPredicationToIntCall(IRBuilder<> &Builder,
   Value *NewOp = Builder.CreateIntrinsic(FID.value(), {VPI.getType()}, Argument,
                                          /*FMFSource=*/nullptr, VPI.getName());
   replaceOperation(*NewOp, VPI);
-  return NewOp;
+  return true;
 }
 
-Value *CachingVPExpander::expandPredicationToFPCall(
+bool CachingVPExpander::expandPredicationToFPCall(
     IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) {
   assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
@@ -297,7 +292,7 @@ Value *CachingVPExpander::expandPredicationToFPCall(
         UnpredicatedIntrinsicID, {VPI.getType()}, Argument,
         /*FMFSource=*/nullptr, VPI.getName());
     replaceOperation(*NewOp, VPI);
-    return NewOp;
+    return true;
   }
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
@@ -315,11 +310,11 @@ Value *CachingVPExpander::expandPredicationToFPCall(
     else
       NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName());
     replaceOperation(*NewOp, VPI);
-    return NewOp;
+    return true;
   }
   }
 
-  return nullptr;
+  return false;
 }
 
 static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
@@ -331,9 +326,8 @@ static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
   return getReductionIdentity(RdxID, EltTy, FMF);
 }
 
-Value *
-CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
-                                                VPReductionIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInReduction(
+    IRBuilder<> &Builder, VPReductionIntrinsic &VPI) {
   assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
 
@@ -391,11 +385,11 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
   }
 
   replaceOperation(*Reduction, VPI);
-  return Reduction;
+  return true;
 }
 
-Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
-                                                           VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
+                                                         VPIntrinsic &VPI) {
   Intrinsic::ID VPID = VPI.getIntrinsicID();
   unsigned CastOpcode = VPIntrinsic::getFunctionalOpcodeForVP(VPID).value();
   assert(Instruction::isCast(CastOpcode));
@@ -404,12 +398,11 @@ Value *CachingVPExpander::expandPredicationToCastIntrinsic(IRBuilder<> &Builder,
                          VPI.getType(), VPI.getName());
 
   replaceOperation(*CastOp, VPI);
-  return CastOp;
+  return true;
 }
 
-Value *
-CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
-                                                      VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                                           VPIntrinsic &VPI) {
   assert(VPI.canIgnoreVectorLengthParam());
 
   const auto &DL = VPI.getDataLayout();
@@ -469,11 +462,11 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
 
   assert(NewMemoryInst);
   replaceOperation(*NewMemoryInst, VPI);
-  return NewMemoryInst;
+  return true;
 }
 
-Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
-                                                        VPCmpIntrinsic &VPI) {
+bool CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
+                                                      VPCmpIntrinsic &VPI) {
   assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
          "Implicitly dropping %evl in non-speculatable operator!");
 
@@ -487,7 +480,7 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
   auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1);
 
   replaceOperation(*NewCmp, VPI);
-  return NewCmp;
+  return true;
 }
 
 bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
@@ -516,17 +509,24 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
   return true;
 }
 
-std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
+bool CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
   LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n');
 
   IRBuilder<> Builder(&VPI);
 
   // Ineffective %evl parameter and so nothing to do here.
   if (VPI.canIgnoreVectorLengthParam())
-    return {&VPI, false};
+    return false;
 
   // Only VP intrinsics can have an %evl parameter.
   Value *OldMaskParam = VPI.getMaskParam();
+  if (!OldMaskParam) {
+    assert((VPI.getIntrinsicID() == Intrinsic::vp_merge ||
+            VPI.getIntrinsicID() == Intrinsic::vp_select) &&
+           "Unexpected VP intrinsic without mask operand");
+    OldMaskParam = VPI.getArgOperand(0);
+  }
+
   Value *OldEVLParam = VPI.getVectorLengthParam();
   assert(OldMaskParam && "no mask param to fold the vl param into");
   assert(OldEVLParam && "no EVL param to fold away");
@@ -538,7 +538,11 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
   ElementCount ElemCount = VPI.getStaticVectorLength();
   Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount);
   Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam);
-  VPI.setMaskParam(NewMaskParam);
+  if (VPI.getIntrinsicID() == Intrinsic::vp_merge ||
+      VPI.getIntrinsicID() == Intrinsic::vp_select)
+    VPI.setArgOperand(0, NewMaskParam);
+  else
+    VPI.setMaskParam(NewMaskParam);
 
   // Drop the %evl parameter.
   discardEVLParameter(VPI);
@@ -546,10 +550,10 @@ std::pair<Value *, bool> CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
          "transformation did not render the evl param ineffective!");
 
   // Reassess the modified instruction.
-  return {&VPI, true};
+  return true;
 }
 
-Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
+bool CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n');
 
   IRBuilder<> Builder(&VPI);
@@ -566,9 +570,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI))
     return expandPredicationInComparison(Builder, *VPCmp);
 
-  if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID())) {
+  if (VPCastIntrinsic::isVPCast(VPI.getIntrinsicID()))
     return expandPredicationToCastIntrinsic(Builder, VPI);
-  }
 
   switch (VPI.getIntrinsicID()) {
   default:
@@ -578,6 +581,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
     replaceOperation(*NewNegOp, VPI);
     return NewNegOp;
   }
+  case Intrinsic::vp_select:
+  case Intrinsic::vp_merge: {
+    assert(maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam());
+    Value *NewSelectOp = Builder.CreateSelect(
+        VPI.getOperand(0), VPI.getOperand(1), VPI.getOperand(2), VPI.getName());
+    replaceOperation(*NewSelectOp, VPI);
+    return NewSelectOp;
+  }
   case Intrinsic::vp_abs:
   case Intrinsic::vp_smax:
   case Intrinsic::vp_smin:
@@ -613,10 +624,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   }
 
   if (auto CID = VPI.getConstrainedIntrinsicID())
-    if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID))
-      return Call;
+    if (expandPredicationToFPCall(Builder, VPI, *CID))
+      return true;
 
-  return &VPI;
+  return false;
 }
 
 //// } CachingVPExpander
@@ -673,8 +684,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) {
       Changed = VPExpansionDetails::IntrinsicUpdated;
     break;
   case VPLegalization::Convert:
-    if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) {
-      (void)NewVPI;
+    if (foldEVLIntoMask(VPI)) {
       Changed = VPExpansionDetails::IntrinsicUpdated;
       ++NumFoldedVL;
     }
@@ -688,7 +698,7 @@ CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) {
   case VPLegalization::Discard:
     llvm_unreachable("Invalid strategy for operators.");
   case VPLegalization::Convert:
-    if (Value *V = expandPredication(VPI); V != &VPI) {
+    if (expandPredication(VPI)) {
       ++NumLoweredVPOps;
       Changed = VPExpansionDetails::IntrinsicReplaced;
     }
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 90a18b86c1b1..b3c312569736 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -1256,7 +1256,7 @@ LLT CallLowering::ValueHandler::getStackValueStoreType(
     if (Flags.isPointer()) {
       LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(),
                                ValTy.getScalarSizeInBits());
-      if (ValVT.isVector())
+      if (ValVT.isVector() && ValVT.getVectorNumElements() != 1)
         return LLT::vector(ValTy.getElementCount(), PtrTy);
       return PtrTy;
     }
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0674f5fd1ae0..0ebee2cfd868 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2094,6 +2094,68 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI,
   return true;
 }
 
+bool CombinerHelper::matchLshrOfTruncOfLshr(MachineInstr &MI,
+                                            LshrOfTruncOfLshr &MatchInfo,
+                                            MachineInstr &ShiftMI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR");
+
+  Register N0 = MI.getOperand(1).getReg();
+  Register N1 = MI.getOperand(2).getReg();
+  unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits();
+
+  APInt N1C, N001C;
+  if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)))
+    return false;
+  auto N001 = ShiftMI.getOperand(2).getReg();
+  if (!mi_match(N001, MRI, m_ICstOrSplat(N001C)))
+    return false;
+
+  if (N001C.getBitWidth() > N1C.getBitWidth())
+    N1C = N1C.zext(N001C.getBitWidth());
+  else
+    N001C = N001C.zext(N1C.getBitWidth());
+
+  Register InnerShift = ShiftMI.getOperand(0).getReg();
+  LLT InnerShiftTy = MRI.getType(InnerShift);
+  uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits();
+  if ((N1C + N001C).ult(InnerShiftSize)) {
+    MatchInfo.Src = ShiftMI.getOperand(1).getReg();
+    MatchInfo.ShiftAmt = N1C + N001C;
+    MatchInfo.ShiftAmtTy = MRI.getType(N001);
+    MatchInfo.InnerShiftTy = InnerShiftTy;
+
+    if ((N001C + OpSizeInBits) == InnerShiftSize)
+      return true;
+    if (MRI.hasOneUse(N0) && MRI.hasOneUse(InnerShift)) {
+      MatchInfo.Mask = true;
+      MatchInfo.MaskVal = APInt(N1C.getBitWidth(), OpSizeInBits) - N1C;
+      return true;
+    }
+  }
+  return false;
+}
+
+void CombinerHelper::applyLshrOfTruncOfLshr(
+    MachineInstr &MI, LshrOfTruncOfLshr &MatchInfo) const {
+  assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a G_LSHR");
+
+  Register Dst = MI.getOperand(0).getReg();
+  auto ShiftAmt =
+      Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt);
+  auto Shift =
+      Builder.buildLShr(MatchInfo.InnerShiftTy, MatchInfo.Src, ShiftAmt);
+  if (MatchInfo.Mask == true) {
+    APInt MaskVal =
+        APInt::getLowBitsSet(MatchInfo.InnerShiftTy.getScalarSizeInBits(),
+                             MatchInfo.MaskVal.getZExtValue());
+    auto Mask = Builder.buildConstant(MatchInfo.InnerShiftTy, MaskVal);
+    auto And = Builder.buildAnd(MatchInfo.InnerShiftTy, Shift, Mask);
+    Builder.buildTrunc(Dst, And);
+  } else
+    Builder.buildTrunc(Dst, Shift);
+  MI.eraseFromParent();
+}
+
 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
                                           unsigned &ShiftVal) const {
   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 008c18837a52..b02465d99a60 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2916,6 +2916,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   case TargetOpcode::G_SREM:
   case TargetOpcode::G_SMIN:
   case TargetOpcode::G_SMAX:
+  case TargetOpcode::G_ABDS:
     Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
@@ -2953,6 +2954,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     return Legalized;
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_UREM:
+  case TargetOpcode::G_ABDU:
     Observer.changingInstr(MI);
     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
@@ -4742,6 +4744,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerShlSat(MI);
   case G_ABS:
     return lowerAbsToAddXor(MI);
+  case G_ABDS:
+  case G_ABDU: {
+    bool IsSigned = MI.getOpcode() == G_ABDS;
+    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    if ((IsSigned && LI.isLegal({G_SMIN, Ty}) && LI.isLegal({G_SMAX, Ty})) ||
+        (!IsSigned && LI.isLegal({G_UMIN, Ty}) && LI.isLegal({G_UMAX, Ty}))) {
+      return lowerAbsDiffToMinMax(MI);
+    }
+    return lowerAbsDiffToSelect(MI);
+  }
   case G_FABS:
     return lowerFAbs(MI);
   case G_SELECT:
@@ -4773,6 +4785,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerVectorReduction(MI);
   case G_VAARG:
     return lowerVAArg(MI);
+  case G_ATOMICRMW_SUB: {
+    auto [Ret, Mem, Val] = MI.getFirst3Regs();
+    const LLT ValTy = MRI.getType(Val);
+    MachineMemOperand *MMO = *MI.memoperands_begin();
+
+    auto VNeg = MIRBuilder.buildNeg(ValTy, Val);
+    MIRBuilder.buildAtomicRMW(G_ATOMICRMW_ADD, Ret, Mem, VNeg, *MMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
@@ -5222,19 +5244,13 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
     InsertVal = MI.getOperand(2).getReg();
 
   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
-
-  // TODO: Handle total scalarization case.
-  if (!NarrowVecTy.isVector())
-    return UnableToLegalize;
-
   LLT VecTy = MRI.getType(SrcVec);
 
   // If the index is a constant, we can really break this down as you would
   // expect, and index into the target size pieces.
-  int64_t IdxVal;
   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
   if (MaybeCst) {
-    IdxVal = MaybeCst->Value.getSExtValue();
+    uint64_t IdxVal = MaybeCst->Value.getZExtValue();
     // Avoid out of bounds indexing the pieces.
     if (IdxVal >= VecTy.getNumElements()) {
       MIRBuilder.buildUndef(DstReg);
@@ -5242,33 +5258,45 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
       return Legalized;
     }
 
-    SmallVector<Register, 8> VecParts;
-    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
+    if (!NarrowVecTy.isVector()) {
+      SmallVector<Register, 8> SplitPieces;
+      extractParts(MI.getOperand(1).getReg(), NarrowVecTy,
+                   VecTy.getNumElements(), SplitPieces, MIRBuilder, MRI);
+      if (IsInsert) {
+        SplitPieces[IdxVal] = InsertVal;
+        MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), SplitPieces);
+      } else {
+        MIRBuilder.buildCopy(MI.getOperand(0).getReg(), SplitPieces[IdxVal]);
+      }
+    } else {
+      SmallVector<Register, 8> VecParts;
+      LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
 
-    // Build a sequence of NarrowTy pieces in VecParts for this operand.
-    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
-                                    TargetOpcode::G_ANYEXT);
+      // Build a sequence of NarrowTy pieces in VecParts for this operand.
+      LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
+                                      TargetOpcode::G_ANYEXT);
 
-    unsigned NewNumElts = NarrowVecTy.getNumElements();
+      unsigned NewNumElts = NarrowVecTy.getNumElements();
 
-    LLT IdxTy = MRI.getType(Idx);
-    int64_t PartIdx = IdxVal / NewNumElts;
-    auto NewIdx =
-        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
+      LLT IdxTy = MRI.getType(Idx);
+      int64_t PartIdx = IdxVal / NewNumElts;
+      auto NewIdx =
+          MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
 
-    if (IsInsert) {
-      LLT PartTy = MRI.getType(VecParts[PartIdx]);
+      if (IsInsert) {
+        LLT PartTy = MRI.getType(VecParts[PartIdx]);
 
-      // Use the adjusted index to insert into one of the subvectors.
-      auto InsertPart = MIRBuilder.buildInsertVectorElement(
-          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
-      VecParts[PartIdx] = InsertPart.getReg(0);
+        // Use the adjusted index to insert into one of the subvectors.
+        auto InsertPart = MIRBuilder.buildInsertVectorElement(
+            PartTy, VecParts[PartIdx], InsertVal, NewIdx);
+        VecParts[PartIdx] = InsertPart.getReg(0);
 
-      // Recombine the inserted subvector with the others to reform the result
-      // vector.
-      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
-    } else {
-      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+        // Recombine the inserted subvector with the others to reform the result
+        // vector.
+        buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
+      } else {
+        MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+      }
     }
 
     MI.eraseFromParent();
@@ -5970,7 +5998,6 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
   return Legalized;
 }
 
-// TODO: Optimize if constant shift amount.
 LegalizerHelper::LegalizeResult
 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                    LLT RequestedTy) {
@@ -5992,6 +6019,27 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
   if (DstEltSize % 2 != 0)
     return UnableToLegalize;
 
+  // Check if we should use multi-way splitting instead of recursive binary
+  // splitting.
+  //
+  // Multi-way splitting directly decomposes wide shifts (e.g., 256-bit ->
+  // 8×32-bit) in a single legalization step, avoiding the recursive overhead
+  // and dependency chains created by the usual binary splitting approach
+  // (256->128->64->32).
+  //
+  // The >= 8 parts threshold ensures we only use this optimization when binary
+  // splitting would require multiple recursive passes, avoiding overhead for
+  // simple 2-way splits where the binary approach is sufficient.
+  if (RequestedTy.isValid() && RequestedTy.isScalar() &&
+      DstEltSize % RequestedTy.getSizeInBits() == 0) {
+    const unsigned NumParts = DstEltSize / RequestedTy.getSizeInBits();
+    // Use multiway if we have 8 or more parts (i.e., would need 3+ recursive
+    // steps).
+    if (NumParts >= 8)
+      return narrowScalarShiftMultiway(MI, RequestedTy);
+  }
+
+  // Fall back to binary splitting:
   // Ignore the input type. We can only go to exactly half the size of the
   // input. If that isn't small enough, the resulting pieces will be further
   // legalized.
@@ -6080,6 +6128,358 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
   return Legalized;
 }
 
+Register LegalizerHelper::buildConstantShiftPart(unsigned Opcode,
+                                                 unsigned PartIdx,
+                                                 unsigned NumParts,
+                                                 ArrayRef<Register> SrcParts,
+                                                 const ShiftParams &Params,
+                                                 LLT TargetTy, LLT ShiftAmtTy) {
+  auto WordShiftConst = getIConstantVRegVal(Params.WordShift, MRI);
+  auto BitShiftConst = getIConstantVRegVal(Params.BitShift, MRI);
+  assert(WordShiftConst && BitShiftConst && "Expected constants");
+
+  const unsigned ShiftWords = WordShiftConst->getZExtValue();
+  const unsigned ShiftBits = BitShiftConst->getZExtValue();
+  const bool NeedsInterWordShift = ShiftBits != 0;
+
+  switch (Opcode) {
+  case TargetOpcode::G_SHL: {
+    // Data moves from lower indices to higher indices
+    // If this part would come from a source beyond our range, it's zero
+    if (PartIdx < ShiftWords)
+      return Params.Zero;
+
+    unsigned SrcIdx = PartIdx - ShiftWords;
+    if (!NeedsInterWordShift)
+      return SrcParts[SrcIdx];
+
+    // Combine shifted main part with carry from previous part
+    auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+    if (SrcIdx > 0) {
+      auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx - 1],
+                                     Params.InvBitShift);
+      return MIRBuilder.buildOr(TargetTy, Hi, Lo).getReg(0);
+    }
+    return Hi.getReg(0);
+  }
+
+  case TargetOpcode::G_LSHR: {
+    unsigned SrcIdx = PartIdx + ShiftWords;
+    if (SrcIdx >= NumParts)
+      return Params.Zero;
+    if (!NeedsInterWordShift)
+      return SrcParts[SrcIdx];
+
+    // Combine shifted main part with carry from next part
+    auto Lo = MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+    if (SrcIdx + 1 < NumParts) {
+      auto Hi = MIRBuilder.buildShl(TargetTy, SrcParts[SrcIdx + 1],
+                                    Params.InvBitShift);
+      return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0);
+    }
+    return Lo.getReg(0);
+  }
+
+  case TargetOpcode::G_ASHR: {
+    // Like LSHR but preserves sign bit
+    unsigned SrcIdx = PartIdx + ShiftWords;
+    if (SrcIdx >= NumParts)
+      return Params.SignBit;
+    if (!NeedsInterWordShift)
+      return SrcParts[SrcIdx];
+
+    // Only the original MSB part uses arithmetic shift to preserve sign. All
+    // other parts use logical shift since they're just moving data bits.
+    auto Lo =
+        (SrcIdx == NumParts - 1)
+            ? MIRBuilder.buildAShr(TargetTy, SrcParts[SrcIdx], Params.BitShift)
+            : MIRBuilder.buildLShr(TargetTy, SrcParts[SrcIdx], Params.BitShift);
+    Register HiSrc =
+        (SrcIdx + 1 < NumParts) ? SrcParts[SrcIdx + 1] : Params.SignBit;
+    auto Hi = MIRBuilder.buildShl(TargetTy, HiSrc, Params.InvBitShift);
+    return MIRBuilder.buildOr(TargetTy, Lo, Hi).getReg(0);
+  }
+
+  default:
+    llvm_unreachable("not a shift");
+  }
+}
+
+Register LegalizerHelper::buildVariableShiftPart(unsigned Opcode,
+                                                 Register MainOperand,
+                                                 Register ShiftAmt,
+                                                 LLT TargetTy,
+                                                 Register CarryOperand) {
+  // This helper generates a single output part for variable shifts by combining
+  // the main operand (shifted by BitShift) with carry bits from an adjacent
+  // part.
+
+  // For G_ASHR, individual parts don't have their own sign bit, only the
+  // complete value does. So we use LSHR for the main operand shift in ASHR
+  // context.
+  unsigned MainOpcode =
+      (Opcode == TargetOpcode::G_ASHR) ? TargetOpcode::G_LSHR : Opcode;
+
+  // Perform the primary shift on the main operand
+  Register MainShifted =
+      MIRBuilder.buildInstr(MainOpcode, {TargetTy}, {MainOperand, ShiftAmt})
+          .getReg(0);
+
+  // No carry operand available
+  if (!CarryOperand.isValid())
+    return MainShifted;
+
+  // If BitShift is 0 (word-aligned shift), no inter-word bit movement occurs,
+  // so carry bits aren't needed.
+  LLT ShiftAmtTy = MRI.getType(ShiftAmt);
+  auto ZeroConst = MIRBuilder.buildConstant(ShiftAmtTy, 0);
+  LLT BoolTy = LLT::scalar(1);
+  auto IsZeroBitShift =
+      MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, ShiftAmt, ZeroConst);
+
+  // Extract bits from the adjacent part that will "carry over" into this part.
+  // The carry direction is opposite to the main shift direction, so we can
+  // align the two shifted values before combining them with OR.
+
+  // Determine the carry shift opcode (opposite direction)
+  unsigned CarryOpcode = (Opcode == TargetOpcode::G_SHL) ? TargetOpcode::G_LSHR
+                                                         : TargetOpcode::G_SHL;
+
+  // Calculate inverse shift amount: BitWidth - ShiftAmt
+  auto TargetBitsConst =
+      MIRBuilder.buildConstant(ShiftAmtTy, TargetTy.getScalarSizeInBits());
+  auto InvShiftAmt = MIRBuilder.buildSub(ShiftAmtTy, TargetBitsConst, ShiftAmt);
+
+  // Shift the carry operand
+  Register CarryBits =
+      MIRBuilder
+          .buildInstr(CarryOpcode, {TargetTy}, {CarryOperand, InvShiftAmt})
+          .getReg(0);
+
+  // If BitShift is 0, don't include carry bits: InvShiftAmt would equal
+  // TargetBits, and a carry shift by the full bit width would yield poison.
+  auto ZeroReg = MIRBuilder.buildConstant(TargetTy, 0);
+  Register SafeCarryBits =
+      MIRBuilder.buildSelect(TargetTy, IsZeroBitShift, ZeroReg, CarryBits)
+          .getReg(0);
+
+  // Combine the main shifted part with the carry bits
+  return MIRBuilder.buildOr(TargetTy, MainShifted, SafeCarryBits).getReg(0);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShiftByConstantMultiway(MachineInstr &MI,
+                                                     const APInt &Amt,
+                                                     LLT TargetTy,
+                                                     LLT ShiftAmtTy) {
+  // Any wide shift can be decomposed into WordShift + BitShift components.
+  // When shift amount is known constant, directly compute the decomposition
+  // values and generate constant registers.
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+
+  const unsigned DstBits = DstTy.getScalarSizeInBits();
+  const unsigned TargetBits = TargetTy.getScalarSizeInBits();
+  const unsigned NumParts = DstBits / TargetBits;
+
+  assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
+
+  // When the shift amount is known at compile time, we just calculate which
+  // source parts contribute to each output part.
+
+  SmallVector<Register, 8> SrcParts;
+  extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI);
+
+  if (Amt.isZero()) {
+    // No shift needed, just copy
+    MIRBuilder.buildMergeLikeInstr(DstReg, SrcParts);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  ShiftParams Params;
+  const unsigned ShiftWords = Amt.getZExtValue() / TargetBits;
+  const unsigned ShiftBits = Amt.getZExtValue() % TargetBits;
+
+  // Generate constants and values needed by all shift types
+  Params.WordShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftWords).getReg(0);
+  Params.BitShift = MIRBuilder.buildConstant(ShiftAmtTy, ShiftBits).getReg(0);
+  Params.InvBitShift =
+      MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - ShiftBits).getReg(0);
+  Params.Zero = MIRBuilder.buildConstant(TargetTy, 0).getReg(0);
+
+  // For ASHR, we need the sign-extended value to fill shifted-out positions
+  if (MI.getOpcode() == TargetOpcode::G_ASHR)
+    Params.SignBit =
+        MIRBuilder
+            .buildAShr(TargetTy, SrcParts[SrcParts.size() - 1],
+                       MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1))
+            .getReg(0);
+
+  SmallVector<Register, 8> DstParts(NumParts);
+  for (unsigned I = 0; I < NumParts; ++I)
+    DstParts[I] = buildConstantShiftPart(MI.getOpcode(), I, NumParts, SrcParts,
+                                         Params, TargetTy, ShiftAmtTy);
+
+  MIRBuilder.buildMergeLikeInstr(DstReg, DstParts);
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShiftMultiway(MachineInstr &MI, LLT TargetTy) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register AmtReg = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT ShiftAmtTy = MRI.getType(AmtReg);
+
+  const unsigned DstBits = DstTy.getScalarSizeInBits();
+  const unsigned TargetBits = TargetTy.getScalarSizeInBits();
+  const unsigned NumParts = DstBits / TargetBits;
+
+  assert(DstBits % TargetBits == 0 && "Target type must evenly divide source");
+  assert(isPowerOf2_32(TargetBits) && "Target bit width must be power of 2");
+
+  // If the shift amount is known at compile time, we can use direct indexing
+  // instead of generating select chains in the general case.
+  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI))
+    return narrowScalarShiftByConstantMultiway(MI, VRegAndVal->Value, TargetTy,
+                                               ShiftAmtTy);
+
+  // For runtime-variable shift amounts, we must generate a more complex
+  // sequence that handles all possible shift values using select chains.
+
+  // Split the input into target-sized pieces
+  SmallVector<Register, 8> SrcParts;
+  extractParts(SrcReg, TargetTy, NumParts, SrcParts, MIRBuilder, MRI);
+
+  // Shifting by zero should be a no-op.
+  auto ZeroAmtConst = MIRBuilder.buildConstant(ShiftAmtTy, 0);
+  LLT BoolTy = LLT::scalar(1);
+  auto IsZeroShift =
+      MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy, AmtReg, ZeroAmtConst);
+
+  // Any wide shift can be decomposed into two components:
+  // 1. WordShift: number of complete target-sized words to shift
+  // 2. BitShift: number of bits to shift within each word
+  //
+  // Example: 128-bit >> 50 with 32-bit target:
+  //   WordShift = 50 / 32 = 1 (shift right by 1 complete word)
+  //   BitShift = 50 % 32 = 18 (shift each word right by 18 bits)
+  unsigned TargetBitsLog2 = Log2_32(TargetBits);
+  auto TargetBitsLog2Const =
+      MIRBuilder.buildConstant(ShiftAmtTy, TargetBitsLog2);
+  auto TargetBitsMask = MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1);
+
+  Register WordShift =
+      MIRBuilder.buildLShr(ShiftAmtTy, AmtReg, TargetBitsLog2Const).getReg(0);
+  Register BitShift =
+      MIRBuilder.buildAnd(ShiftAmtTy, AmtReg, TargetBitsMask).getReg(0);
+
+  // Fill values:
+  // - SHL/LSHR: fill with zeros
+  // - ASHR: fill with sign-extended MSB
+  Register ZeroReg = MIRBuilder.buildConstant(TargetTy, 0).getReg(0);
+
+  Register FillValue;
+  if (MI.getOpcode() == TargetOpcode::G_ASHR) {
+    auto TargetBitsMinusOneConst =
+        MIRBuilder.buildConstant(ShiftAmtTy, TargetBits - 1);
+    FillValue = MIRBuilder
+                    .buildAShr(TargetTy, SrcParts[NumParts - 1],
+                               TargetBitsMinusOneConst)
+                    .getReg(0);
+  } else {
+    FillValue = ZeroReg;
+  }
+
+  SmallVector<Register, 8> DstParts(NumParts);
+
+  // For each output part, generate a select chain that chooses the correct
+  // result based on the runtime WordShift value. This handles all possible
+  // word shift amounts by pre-calculating what each would produce.
+  for (unsigned I = 0; I < NumParts; ++I) {
+    // Initialize with appropriate default value for this shift type
+    Register InBoundsResult = FillValue;
+
+    // clang-format off
+    // Build a branchless select chain by pre-computing results for all possible
+    // WordShift values (0 to NumParts-1). Each iteration nests a new select:
+    //
+    // K=0: select(WordShift==0, result0, FillValue)
+    // K=1: select(WordShift==1, result1, select(WordShift==0, result0, FillValue))
+    // K=2: select(WordShift==2, result2, select(WordShift==1, result1, select(...)))
+    // clang-format on
+    for (unsigned K = 0; K < NumParts; ++K) {
+      auto WordShiftKConst = MIRBuilder.buildConstant(ShiftAmtTy, K);
+      auto IsWordShiftK = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, BoolTy,
+                                               WordShift, WordShiftKConst);
+
+      // Calculate source indices for this word shift
+      //
+      // For 4-part 128-bit value with K=1 word shift:
+      // SHL:  [3][2][1][0] << K  =>  [2][1][0][Z]
+      //     -> (MainIdx = I-K, CarryIdx = I-K-1)
+      // LSHR: [3][2][1][0] >> K  =>  [Z][3][2][1]
+      //     -> (MainIdx = I+K, CarryIdx = I+K+1)
+      int MainSrcIdx;
+      int CarrySrcIdx; // Index for the word that provides the carried-in bits.
+
+      switch (MI.getOpcode()) {
+      case TargetOpcode::G_SHL:
+        MainSrcIdx = (int)I - (int)K;
+        CarrySrcIdx = MainSrcIdx - 1;
+        break;
+      case TargetOpcode::G_LSHR:
+      case TargetOpcode::G_ASHR:
+        MainSrcIdx = (int)I + (int)K;
+        CarrySrcIdx = MainSrcIdx + 1;
+        break;
+      default:
+        llvm_unreachable("Not a shift");
+      }
+
+      // Check bounds and build the result for this word shift
+      Register ResultForK;
+      if (MainSrcIdx >= 0 && MainSrcIdx < (int)NumParts) {
+        Register MainOp = SrcParts[MainSrcIdx];
+        Register CarryOp;
+
+        // Determine carry operand with bounds checking
+        if (CarrySrcIdx >= 0 && CarrySrcIdx < (int)NumParts)
+          CarryOp = SrcParts[CarrySrcIdx];
+        else if (MI.getOpcode() == TargetOpcode::G_ASHR &&
+                 CarrySrcIdx >= (int)NumParts)
+          CarryOp = FillValue; // Use sign extension
+
+        ResultForK = buildVariableShiftPart(MI.getOpcode(), MainOp, BitShift,
+                                            TargetTy, CarryOp);
+      } else {
+        // Out of bounds - use fill value for this k
+        ResultForK = FillValue;
+      }
+
+      // Select this result if WordShift equals k
+      InBoundsResult =
+          MIRBuilder
+              .buildSelect(TargetTy, IsWordShiftK, ResultForK, InBoundsResult)
+              .getReg(0);
+    }
+
+    // Handle zero-shift special case: if shift is 0, use original input
+    DstParts[I] =
+        MIRBuilder
+            .buildSelect(TargetTy, IsZeroShift, SrcParts[I], InBoundsResult)
+            .getReg(0);
+  }
+
+  MIRBuilder.buildMergeLikeInstr(DstReg, DstParts);
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                        LLT MoreTy) {
@@ -9537,6 +9937,54 @@ LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsDiffToSelect(MachineInstr &MI) {
+  assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
+          MI.getOpcode() == TargetOpcode::G_ABDU) &&
+         "Expected G_ABDS or G_ABDU instruction");
+
+  auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
+  LLT Ty = MRI.getType(LHS);
+
+  // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+  // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+  Register LHSSub = MIRBuilder.buildSub(Ty, LHS, RHS).getReg(0);
+  Register RHSSub = MIRBuilder.buildSub(Ty, RHS, LHS).getReg(0);
+  CmpInst::Predicate Pred = (MI.getOpcode() == TargetOpcode::G_ABDS)
+                                ? CmpInst::ICMP_SGT
+                                : CmpInst::ICMP_UGT;
+  auto ICmp = MIRBuilder.buildICmp(Pred, LLT::scalar(1), LHS, RHS);
+  MIRBuilder.buildSelect(DstReg, ICmp, LHSSub, RHSSub);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsDiffToMinMax(MachineInstr &MI) {
+  assert((MI.getOpcode() == TargetOpcode::G_ABDS ||
+          MI.getOpcode() == TargetOpcode::G_ABDU) &&
+         "Expected G_ABDS or G_ABDU instruction");
+
+  auto [DstReg, LHS, RHS] = MI.getFirst3Regs();
+  LLT Ty = MRI.getType(LHS);
+
+  // abds(lhs, rhs) -> sub(smax(lhs, rhs), smin(lhs, rhs))
+  // abdu(lhs, rhs) -> sub(umax(lhs, rhs), umin(lhs, rhs))
+  Register MaxReg, MinReg;
+  if (MI.getOpcode() == TargetOpcode::G_ABDS) {
+    MaxReg = MIRBuilder.buildSMax(Ty, LHS, RHS).getReg(0);
+    MinReg = MIRBuilder.buildSMin(Ty, LHS, RHS).getReg(0);
+  } else {
+    MaxReg = MIRBuilder.buildUMax(Ty, LHS, RHS).getReg(0);
+    MinReg = MIRBuilder.buildUMin(Ty, LHS, RHS).getReg(0);
+  }
+  MIRBuilder.buildSub(DstReg, MaxReg, MinReg);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
   Register SrcReg = MI.getOperand(1).getReg();
   Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index e41fd81953f4..58d631e569b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -466,8 +466,14 @@ llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) {
 std::optional<DefinitionAndSourceRegister>
 llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) {
   Register DefSrcReg = Reg;
-  auto *DefMI = MRI.getVRegDef(Reg);
-  auto DstTy = MRI.getType(DefMI->getOperand(0).getReg());
+  // This assumes that the code is in SSA form, so there should only be one
+  // definition.
+  auto DefIt = MRI.def_begin(Reg);
+  if (DefIt == MRI.def_end())
+    return {};
+  MachineOperand &DefOpnd = *DefIt;
+  MachineInstr *DefMI = DefOpnd.getParent();
+  auto DstTy = MRI.getType(DefOpnd.getReg());
   if (!DstTy.isValid())
     return std::nullopt;
   unsigned Opc = DefMI->getOpcode();
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 93f6e39b56ab..e3ded12a1847 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -537,28 +537,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
          "number of stored element should be a multiple of Factor");
 
   Value *Mask = nullptr;
+  auto GapMask = APInt::getAllOnes(Factor);
   if (SI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    APInt GapMask(Factor, 0);
     std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
                                       ElementCount::getFixed(LaneMaskLen));
     if (!Mask)
       return false;
-    // We haven't supported gap mask for stores. Yet it is possible that we
-    // already changed the IR, hence returning true here.
-    if (GapMask.popcount() != Factor)
-      return true;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                       << *Store << "\n");
+    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+                      << " and actual factor " << GapMask.popcount() << "\n");
   }
 
   // Try to create target specific intrinsics to replace the store and
   // shuffle.
-  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
     return false;
 
   // Already have a new target specific interleaved store. Erase the old store.
@@ -662,6 +660,10 @@ static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
   }
 
   if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
+    Type *Op1Ty = SVI->getOperand(1)->getType();
+    if (!isa<FixedVectorType>(Op1Ty))
+      return {nullptr, GapMask};
+
     // Check that the shuffle mask is: a) an interleave, b) all of the same
     // set of the elements, and c) contained by the first source.  (c) could
     // be relaxed if desired.
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index f12f437c493e..9d98e6c085fe 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -536,12 +536,6 @@ public:
 
 namespace llvm {
 
-/// Implementation of the LiveDebugVariables pass.
-
-LiveDebugVariables::LiveDebugVariables() = default;
-LiveDebugVariables::~LiveDebugVariables() = default;
-LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default;
-
 class LiveDebugVariables::LDVImpl {
   LocMap::Allocator allocator;
   MachineFunction *MF = nullptr;
@@ -683,6 +677,12 @@ public:
   void print(raw_ostream&);
 };
 
+/// Implementation of the LiveDebugVariables pass.
+
+LiveDebugVariables::LiveDebugVariables() = default;
+LiveDebugVariables::~LiveDebugVariables() = default;
+LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default;
+
 } // namespace llvm
 
 static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS,
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
index 116a919585d7..17a7f48e3f2e 100644
--- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -21,6 +21,10 @@ using namespace llvm;
 
 AnalysisKey MachineFunctionAnalysis::Key;
 
+llvm::MachineFunctionAnalysis::Result::Result(
+    std::unique_ptr<MachineFunction> MF)
+    : MF(std::move(MF)) {}
+
 bool MachineFunctionAnalysis::Result::invalidate(
     Function &, const PreservedAnalyses &PA,
     FunctionAnalysisManager::Invalidator &) {
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index d9e8484c08d7..da29ffc9d2fe 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -133,7 +133,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
   SmallSetVector<Register, 32> LocalDefs;
   BitVector LocalDefsP(TRI->getNumRegUnits());
   SmallSet<Register, 8> DeadDefSet;
-  SmallSet<Register, 16> KilledDefSet;
   SmallSetVector<Register, 8> ExternUses;
   SmallSet<Register, 8> KilledUseSet;
   SmallSet<Register, 8> UndefUseSet;
@@ -151,7 +150,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
         MO.setIsInternalRead();
         if (MO.isKill()) {
           // Internal def is now killed.
-          KilledDefSet.insert(Reg);
+          DeadDefSet.insert(Reg);
         }
       } else {
         if (ExternUses.insert(Reg)) {
@@ -171,21 +170,18 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
         continue;
 
       if (LocalDefs.insert(Reg)) {
-        if (MO.isDead())
-          DeadDefSet.insert(Reg);
+        if (!MO.isDead() && Reg.isPhysical()) {
+          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+            LocalDefsP.set(Unit);
+        }
       } else {
-        // Re-defined inside the bundle, it's no longer killed.
-        KilledDefSet.erase(Reg);
         if (!MO.isDead()) {
-          // Previously defined but dead.
+          // Re-defined inside the bundle, it's no longer dead.
           DeadDefSet.erase(Reg);
         }
       }
-
-      if (!MO.isDead() && Reg.isPhysical()) {
-        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
-          LocalDefsP.set(Unit);
-      }
+      if (MO.isDead())
+        DeadDefSet.insert(Reg);
     }
 
     // Set FrameSetup/FrameDestroy for the bundle. If any of the instructions
@@ -198,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
 
   for (Register Reg : LocalDefs) {
     // If it's not live beyond end of the bundle, mark it dead.
-    bool isDead = DeadDefSet.contains(Reg) || KilledDefSet.contains(Reg);
+    bool isDead = DeadDefSet.contains(Reg);
     MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |
                         getImplRegState(true));
   }
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index b0bce2c21a47..fdae3b470de0 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,8 +59,10 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -107,6 +109,16 @@ STATISTIC(StableHashAttempts,
 STATISTIC(StableHashDropped,
           "Count of unsuccessful hashing attempts for outlined functions");
 STATISTIC(NumRemovedLOHs, "Total number of Linker Optimization Hints removed");
+STATISTIC(NumPGOBlockedOutlined,
+          "Number of times outlining was blocked by PGO");
+STATISTIC(NumPGOAllowedCold,
+          "Number of times outlining was allowed from cold functions");
+STATISTIC(NumPGOConservativeBlockedOutlined,
+          "Number of times outlining was blocked conservatively when profile "
+          "counts were missing");
+STATISTIC(NumPGOOptimisticOutlined,
+          "Number of times outlining was allowed optimistically when profile "
+          "counts were missing");
 
 // Set to true if the user wants the outliner to run on linkonceodr linkage
 // functions. This is false by default because the linker can dedupe linkonceodr
@@ -438,11 +450,10 @@ struct MachineOutliner : public ModulePass {
   /// The current repeat number of machine outlining.
   unsigned OutlineRepeatedNum = 0;
 
-  /// Set to true if the outliner should run on all functions in the module
-  /// considered safe for outlining.
-  /// Set to true by default for compatibility with llc's -run-pass option.
-  /// Set when the pass is constructed in TargetPassConfig.
-  bool RunOnAllFunctions = true;
+  /// The mode controlling whether, and on which code, the outliner runs.
+  /// Set to always-outline by default for compatibility with llc's -run-pass
+  /// option.
+  RunOutliner RunOutlinerMode = RunOutliner::AlwaysOutline;
 
   /// This is a compact representation of hash sequences of outlined functions.
   /// It is used when OutlinerMode = CGDataMode::Write.
@@ -468,6 +479,11 @@ struct MachineOutliner : public ModulePass {
     AU.addRequired<TargetPassConfig>();
     AU.addPreserved<MachineModuleInfoWrapperPass>();
     AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>();
+    if (RunOutlinerMode == RunOutliner::OptimisticPGO ||
+        RunOutlinerMode == RunOutliner::ConservativePGO) {
+      AU.addRequired<BlockFrequencyInfoWrapperPass>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    }
     AU.setPreservesAll();
     ModulePass::getAnalysisUsage(AU);
   }
@@ -578,9 +594,9 @@ struct MachineOutliner : public ModulePass {
 char MachineOutliner::ID = 0;
 
 namespace llvm {
-ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) {
+ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
   MachineOutliner *OL = new MachineOutliner();
-  OL->RunOnAllFunctions = RunOnAllFunctions;
+  OL->RunOutlinerMode = RunOutlinerMode;
   return OL;
 }
 
@@ -1017,9 +1033,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
         /* Outlined code is optimized code by definition. */
         DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
 
-    // Don't add any new variables to the subprogram.
-    DB.finalizeSubprogram(OutlinedSP);
-
     // Attach subprogram to the function.
     F->setSubprogram(OutlinedSP);
     // We're done with the DIBuilder.
@@ -1201,10 +1214,49 @@ bool MachineOutliner::outline(
   return OutlinedSomething;
 }
 
+static bool allowPGOOutlining(RunOutliner RunOutlinerMode,
+                              const ProfileSummaryInfo *PSI,
+                              const BlockFrequencyInfo *BFI,
+                              MachineBasicBlock &MBB) {
+  if (RunOutlinerMode != RunOutliner::OptimisticPGO &&
+      RunOutlinerMode != RunOutliner::ConservativePGO)
+    return true;
+  auto *MF = MBB.getParent();
+  if (MF->getFunction().hasFnAttribute(Attribute::Cold)) {
+    ++NumPGOAllowedCold;
+    return true;
+  }
+
+  auto *BB = MBB.getBasicBlock();
+  if (BB && PSI && BFI)
+    if (auto Count = BFI->getBlockProfileCount(BB))
+      return *Count <= PSI->getOrCompColdCountThreshold();
+
+  if (RunOutlinerMode == RunOutliner::OptimisticPGO) {
+    auto *TII = MF->getSubtarget().getInstrInfo();
+    if (TII->shouldOutlineFromFunctionByDefault(*MF)) {
+      // Profile data is unavailable, but we optimistically allow outlining
+      ++NumPGOOptimisticOutlined;
+      return true;
+    }
+    return false;
+  }
+  assert(RunOutlinerMode == RunOutliner::ConservativePGO);
+  // Profile data is unavailable, so we conservatively block outlining
+  ++NumPGOConservativeBlockedOutlined;
+  return false;
+}
+
 void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
   // Build instruction mappings for each function in the module. Start by
   // iterating over each Function in M.
   LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n");
+  bool EnableProfileGuidedOutlining =
+      RunOutlinerMode == RunOutliner::OptimisticPGO ||
+      RunOutlinerMode == RunOutliner::ConservativePGO;
+  ProfileSummaryInfo *PSI = nullptr;
+  if (EnableProfileGuidedOutlining)
+    PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   for (Function &F : M) {
     LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n");
 
@@ -1225,7 +1277,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
     }
 
     const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-    if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) {
+    BlockFrequencyInfo *BFI = nullptr;
+    if (EnableProfileGuidedOutlining && F.hasProfileData())
+      BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+    if (RunOutlinerMode == RunOutliner::TargetDefault &&
+        !TII->shouldOutlineFromFunctionByDefault(*MF)) {
       LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from "
                            "function by default\n");
       continue;
@@ -1265,6 +1321,11 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M) {
         continue;
       }
 
+      if (!allowPGOOutlining(RunOutlinerMode, PSI, BFI, MBB)) {
+        ++NumPGOBlockedOutlined;
+        continue;
+      }
+
       // MBB is suitable for outlining. Map it to a list of unsigneds.
       Mapper.convertToUnsignedVec(MBB, *TII);
     }
@@ -1437,10 +1498,22 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   // the user how the outliner is running.
   LLVM_DEBUG({
     dbgs() << "Machine Outliner: Running on ";
-    if (RunOnAllFunctions)
+    switch (RunOutlinerMode) {
+    case RunOutliner::AlwaysOutline:
       dbgs() << "all functions";
-    else
+      break;
+    case RunOutliner::OptimisticPGO:
+      dbgs() << "optimistically cold functions";
+      break;
+    case RunOutliner::ConservativePGO:
+      dbgs() << "conservatively cold functions";
+      break;
+    case RunOutliner::TargetDefault:
       dbgs() << "target-default functions";
+      break;
+    case RunOutliner::NeverOutline:
+      llvm_unreachable("should not outline");
+    }
     dbgs() << "\n";
   });
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index b7135251781a..abb3f3e61200 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
+MachineOperand *MachineRegisterInfo::getOneNonDBGUse(Register RegNo) const {
+  auto RegNoDbgUses = use_nodbg_operands(RegNo);
+  return hasSingleElement(RegNoDbgUses) ? &*RegNoDbgUses.begin() : nullptr;
+}
+
 MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
   auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
   return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 96c9cde622b4..f54e2f264556 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -507,83 +507,86 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
       });
       break;
     case Intrinsic::objc_autorelease:
-      Changed |= lowerObjCCall(F, RTLIB::objc_autorelease);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_autorelease);
       break;
     case Intrinsic::objc_autoreleasePoolPop:
-      Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPop);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPop);
       break;
     case Intrinsic::objc_autoreleasePoolPush:
-      Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPush);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleasePoolPush);
       break;
     case Intrinsic::objc_autoreleaseReturnValue:
-      Changed |= lowerObjCCall(F, RTLIB::objc_autoreleaseReturnValue);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_autoreleaseReturnValue);
       break;
     case Intrinsic::objc_copyWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_copyWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_copyWeak);
       break;
     case Intrinsic::objc_destroyWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_destroyWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_destroyWeak);
       break;
     case Intrinsic::objc_initWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_initWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_initWeak);
       break;
     case Intrinsic::objc_loadWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_loadWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeak);
       break;
     case Intrinsic::objc_loadWeakRetained:
-      Changed |= lowerObjCCall(F, RTLIB::objc_loadWeakRetained);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_loadWeakRetained);
       break;
     case Intrinsic::objc_moveWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_moveWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_moveWeak);
       break;
     case Intrinsic::objc_release:
-      Changed |= lowerObjCCall(F, RTLIB::objc_release, true);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_release, true);
       break;
     case Intrinsic::objc_retain:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retain, true);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain, true);
       break;
     case Intrinsic::objc_retainAutorelease:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retainAutorelease);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainAutorelease);
       break;
     case Intrinsic::objc_retainAutoreleaseReturnValue:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleaseReturnValue);
+      Changed |=
+          lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleaseReturnValue);
       break;
     case Intrinsic::objc_retainAutoreleasedReturnValue:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleasedReturnValue);
+      Changed |=
+          lowerObjCCall(F, RTLIB::impl_objc_retainAutoreleasedReturnValue);
       break;
     case Intrinsic::objc_claimAutoreleasedReturnValue:
-      Changed |= lowerObjCCall(F, RTLIB::objc_claimAutoreleasedReturnValue);
+      Changed |=
+          lowerObjCCall(F, RTLIB::impl_objc_claimAutoreleasedReturnValue);
       break;
     case Intrinsic::objc_retainBlock:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retainBlock);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainBlock);
       break;
     case Intrinsic::objc_storeStrong:
-      Changed |= lowerObjCCall(F, RTLIB::objc_storeStrong);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeStrong);
       break;
     case Intrinsic::objc_storeWeak:
-      Changed |= lowerObjCCall(F, RTLIB::objc_storeWeak);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_storeWeak);
       break;
     case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
       Changed |=
-          lowerObjCCall(F, RTLIB::objc_unsafeClaimAutoreleasedReturnValue);
+          lowerObjCCall(F, RTLIB::impl_objc_unsafeClaimAutoreleasedReturnValue);
       break;
     case Intrinsic::objc_retainedObject:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retainedObject);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_retainedObject);
       break;
     case Intrinsic::objc_unretainedObject:
-      Changed |= lowerObjCCall(F, RTLIB::objc_unretainedObject);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedObject);
       break;
     case Intrinsic::objc_unretainedPointer:
-      Changed |= lowerObjCCall(F, RTLIB::objc_unretainedPointer);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_unretainedPointer);
       break;
     case Intrinsic::objc_retain_autorelease:
-      Changed |= lowerObjCCall(F, RTLIB::objc_retain_autorelease);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_retain_autorelease);
       break;
     case Intrinsic::objc_sync_enter:
-      Changed |= lowerObjCCall(F, RTLIB::objc_sync_enter);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_enter);
       break;
     case Intrinsic::objc_sync_exit:
-      Changed |= lowerObjCCall(F, RTLIB::objc_sync_exit);
+      Changed |= lowerObjCCall(F, RTLIB::impl_objc_sync_exit);
       break;
     case Intrinsic::exp:
     case Intrinsic::exp2:
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 415674231b5c..a589ef761dd7 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -275,7 +275,6 @@ void ReachingDefAnalysis::printAllReachingDefs(MachineFunction &MF) {
 
 bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
-  TRI = MF->getSubtarget().getRegisterInfo();
   const TargetSubtargetInfo &STI = MF->getSubtarget();
   TRI = STI.getRegisterInfo();
   TII = STI.getInstrInfo();
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 27b5a0d37b67..d130efe96b56 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4710,7 +4710,10 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) {
     if (SDValue LogBase2 = BuildLogBase2(N1, DL)) {
       EVT ShiftVT = getShiftAmountTy(N0.getValueType());
       SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
-      return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc);
+      SDNodeFlags Flags;
+      Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap());
+      // TODO: Preserve setNoSignedWrap if LogBase2 isn't BitWidth - 1.
+      return Matcher.getNode(ISD::SHL, DL, VT, N0, Trunc, Flags);
     }
   }
 
@@ -9998,13 +10001,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     }
   }
 
-  // fold (not (neg x)) -> (add X, -1)
-  // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
-  // Y is a constant or the subtract has a single use.
-  if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
-      isNullConstant(N0.getOperand(0))) {
-    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
-                       DAG.getAllOnesConstant(DL, VT));
+  // fold (not (sub Y, X)) -> (add X, ~Y) if Y is a constant
+  if (N0.getOpcode() == ISD::SUB && isAllOnesConstant(N1)) {
+    SDValue Y = N0.getOperand(0);
+    SDValue X = N0.getOperand(1);
+
+    if (auto *YConst = dyn_cast<ConstantSDNode>(Y)) {
+      APInt NotYValue = ~YConst->getAPIntValue();
+      SDValue NotY = DAG.getConstant(NotYValue, DL, VT);
+      return DAG.getNode(ISD::ADD, DL, VT, X, NotY, N->getFlags());
+    }
   }
 
   // fold (not (add X, -1)) -> (neg X)
@@ -11089,38 +11095,43 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     }
   }
 
-  // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
-  //                               (and (srl x, (sub c2, c1), MASK)
-  if (N0.getOpcode() == ISD::SHL &&
-      (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
-      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
-    auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
-                                           ConstantSDNode *RHS) {
-      const APInt &LHSC = LHS->getAPIntValue();
-      const APInt &RHSC = RHS->getAPIntValue();
-      return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
-             LHSC.getZExtValue() <= RHSC.getZExtValue();
-    };
-    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
-                                  /*AllowUndefs*/ false,
-                                  /*AllowTypeMismatch*/ true)) {
-      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
-      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
-      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
-      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
-      Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
-      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
-      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
-    }
-    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
-                                  /*AllowUndefs*/ false,
-                                  /*AllowTypeMismatch*/ true)) {
-      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
-      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
-      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
-      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
-      SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
-      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+  if (N0.getOpcode() == ISD::SHL) {
+    // fold (srl (shl nuw x, c), c) -> x
+    if (N0.getOperand(1) == N1 && N0->getFlags().hasNoUnsignedWrap())
+      return N0.getOperand(0);
+
+    // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2)), MASK) or
+    //                               (and (srl x, (sub c2, c1)), MASK)
+    if ((N0.getOperand(1) == N1 || N0->hasOneUse()) &&
+        TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
+      auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
+                                             ConstantSDNode *RHS) {
+        const APInt &LHSC = LHS->getAPIntValue();
+        const APInt &RHSC = RHS->getAPIntValue();
+        return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
+               LHSC.getZExtValue() <= RHSC.getZExtValue();
+      };
+      if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+                                    /*AllowUndefs*/ false,
+                                    /*AllowTypeMismatch*/ true)) {
+        SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+        SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+        SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+        Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
+        Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
+        SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+        return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+      }
+      if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+                                    /*AllowUndefs*/ false,
+                                    /*AllowTypeMismatch*/ true)) {
+        SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+        SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+        SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+        Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
+        SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
+        return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+      }
     }
   }
 
@@ -15137,7 +15148,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       return foldedExt;
   } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
              ISD::isUNINDEXEDLoad(N0.getNode()) &&
-             TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+             TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
@@ -16309,7 +16320,15 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
         SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
         SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
-        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+        SDNodeFlags Flags;
+        // Propagate nuw for sub.
+        if (N0->getOpcode() == ISD::SUB && N0->getFlags().hasNoUnsignedWrap() &&
+            DAG.MaskedValueIsZero(
+                N0->getOperand(0),
+                APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
+                                      VT.getScalarSizeInBits())))
+          Flags.setNoUnsignedWrap(true);
+        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR, Flags);
       }
     }
     break;
@@ -16788,6 +16807,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // If we have frozen and unfrozen users of N0, update so everything uses N.
   if (!N0.isUndef() && !N0.hasOneUse()) {
     SDValue FrozenN0(N, 0);
+    // Unfreeze all uses of N to avoid double deleting N from the CSE map.
+    DAG.ReplaceAllUsesOfValueWith(FrozenN0, N0);
     DAG.ReplaceAllUsesOfValueWith(N0, FrozenN0);
     // ReplaceAllUsesOfValueWith will have also updated the use in N, thus
     // creating a cycle in a DAG. Let's undo that by mutating the freeze.
@@ -19346,13 +19367,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
   // MachineBasicBlock CFG, which is awkward.
 
   // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
-  // on the target.
+  // on the target, also copy fast math flags.
   if (N1.getOpcode() == ISD::SETCC &&
       TLI.isOperationLegalOrCustom(ISD::BR_CC,
                                    N1.getOperand(0).getValueType())) {
-    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
-                       Chain, N1.getOperand(2),
-                       N1.getOperand(0), N1.getOperand(1), N2);
+    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain,
+                       N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2,
+                       N1->getFlags());
   }
 
   if (N1.hasOneUse()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 1a63518ab37a..861f76e93f2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -238,7 +238,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
 
     // Create the result registers for this node and add the result regs to
     // the machine instruction.
-    if (VRBase == 0) {
+    if (!VRBase) {
       assert(RC && "Isn't a register operand!");
       VRBase = MRI->createVirtualRegister(RC);
       MIB.addReg(VRBase, RegState::Define);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 90d62e6da8e9..9e85f08abb76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -324,6 +324,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
     Res = PromoteIntRes_VP_REDUCE(N);
     break;
 
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+    Res = PromoteIntRes_LOOP_DEPENDENCE_MASK(N);
+    break;
+
   case ISD::FREEZE:
     Res = PromoteIntRes_FREEZE(N);
     break;
@@ -374,6 +379,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
   return GetPromotedInteger(Op);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  EVT VT = N->getValueType(0); // Result type before promotion.
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, N->ops()); // Same ops.
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
   // Sign-extend the new bits, and continue the assertion.
   SDValue Op = SExtPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 65fd863e55ac..586c3411791f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -382,6 +382,7 @@ private:
   SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
   SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N);
   SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N);
+  SDValue PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -436,6 +437,7 @@ private:
   SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N);
   SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N);
+  SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo);
 
   void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -868,6 +870,7 @@ private:
   // Vector Result Scalarization: <1 x ty> -> ty.
   void ScalarizeVectorResult(SDNode *N, unsigned ResNo);
   SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+  SDValue ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N);
   SDValue ScalarizeVecRes_BinOp(SDNode *N);
   SDValue ScalarizeVecRes_CMP(SDNode *N);
   SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
@@ -964,6 +967,7 @@ private:
   void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -1070,6 +1074,7 @@ private:
   SDValue WidenVecRes_ADDRSPACECAST(SDNode *N);
   SDValue WidenVecRes_AssertZext(SDNode* N);
   SDValue WidenVecRes_BITCAST(SDNode* N);
+  SDValue WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N);
   SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
   SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
   SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2ca98958fde0..8e423c4f83b3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -138,6 +138,7 @@ class VectorLegalizer {
   SDValue ExpandVP_FNEG(SDNode *Node);
   SDValue ExpandVP_FABS(SDNode *Node);
   SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
+  SDValue ExpandLOOP_DEPENDENCE_MASK(SDNode *N);
   SDValue ExpandSELECT(SDNode *Node);
   std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
   SDValue ExpandStore(SDNode *N);
@@ -475,6 +476,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::VECTOR_COMPRESS:
   case ISD::SCMP:
   case ISD::UCMP:
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::SMULFIX:
@@ -1291,6 +1294,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::UCMP:
     Results.push_back(TLI.expandCMP(Node, DAG));
     return;
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+    Results.push_back(ExpandLOOP_DEPENDENCE_MASK(Node));
+    return;
 
   case ISD::FADD:
   case ISD::FMUL:
@@ -1796,6 +1803,50 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
   return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
 }
 
+SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) {
+  SDLoc DL(N);
+  SDValue SourceValue = N->getOperand(0);
+  SDValue SinkValue = N->getOperand(1);
+  SDValue EltSize = N->getOperand(2);
+  // WAR and RAW share this expansion; IsReadAfterWrite selects the RAW parts.
+  bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK;
+  EVT VT = N->getValueType(0);
+  EVT PtrVT = SourceValue->getValueType(0);
+  // Diff = (Sink - Source) / EltSize; RAW takes ABS before the signed divide.
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
+  if (IsReadAfterWrite)
+    Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff);
+
+  Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize);
+  // If the difference is positive then some elements may alias. Cmp is the
+  // opposite case: Diff <= 0 (WAR) or Diff == 0 (RAW), ORed into every lane.
+  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                     Diff.getValueType());
+  SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT);
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero,
+                             IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE);
+
+  // Create the lane mask: the step vector is compared ULT against splat(Diff)
+  EVT SplatVT = VT.changeElementType(PtrVT);
+  SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff);
+  SDValue VectorStep = DAG.getStepVector(DL, SplatVT);
+  EVT MaskVT = VT.changeElementType(MVT::i1);
+  SDValue DiffMask =
+      DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT);
+
+  EVT EltVT = VT.getVectorElementType();
+  // Extend the diff setcc in case the intrinsic has been promoted to a vector
+  // type with elements larger than i1
+  if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits())
+    DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask);
+
+  // Splat the compare result then OR it with the lane mask
+  if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits())
+    Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp);
+  SDValue Splat = DAG.getSplat(VT, DL, Cmp);
+  return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat);
+}
+
 void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
                                        SmallVectorImpl<SDValue> &Results) {
   // Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 10e3a5149a5d..118fd8418f78 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -53,6 +53,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
     report_fatal_error("Do not know how to scalarize the result of this "
                        "operator!\n");
 
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+    R = ScalarizeVecRes_LOOP_DEPENDENCE_MASK(N);
+    break;
   case ISD::MERGE_VALUES:      R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
   case ISD::BITCAST:           R = ScalarizeVecRes_BITCAST(N); break;
   case ISD::BUILD_VECTOR:      R = ScalarizeVecRes_BUILD_VECTOR(N); break;
@@ -396,6 +400,22 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
   return GetScalarizedVector(Op);
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  SDValue SourceValue = N->getOperand(0);
+  SDValue SinkValue = N->getOperand(1);
+  SDValue EltSize = N->getOperand(2);
+  EVT PtrVT = SourceValue->getValueType(0);
+  SDLoc DL(N);
+  // Diff >= EltSize || Diff == 0. NOTE(review): same for WAR and RAW; confirm.
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue);
+  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                     Diff.getValueType());
+  SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT);
+  return DAG.getNode(ISD::OR, DL, CmpVT,
+                     DAG.getSetCC(DL, CmpVT, Diff, EltSize, ISD::SETGE),
+                     DAG.getSetCC(DL, CmpVT, Diff, Zero, ISD::SETEQ));
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
   SDValue Op = N->getOperand(0);
   if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector)
@@ -1159,6 +1179,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     report_fatal_error("Do not know how to split the result of this "
                        "operator!\n");
 
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+    SplitVecRes_LOOP_DEPENDENCE_MASK(N, Lo, Hi);
+    break;
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::AssertZext:   SplitVecRes_AssertZext(N, Lo, Hi); break;
   case ISD::VSELECT:
@@ -1652,6 +1676,25 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
+                                                        SDValue &Hi) {
+  SDLoc DL(N);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  SDValue PtrA = N->getOperand(0);
+  SDValue PtrB = N->getOperand(1);
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2));
+  // Hi is the same node with PtrA advanced by EltSize * (HiVT min elements).
+  unsigned EltSize = N->getConstantOperandVal(2);
+  unsigned Offset = EltSize * HiVT.getVectorMinNumElements();
+  SDValue Addend = HiVT.isScalableVT()
+                       ? DAG.getVScale(DL, MVT::i64, APInt(64, Offset))
+                       : DAG.getConstant(Offset, DL, MVT::i64);
+  // NOTE(review): offset uses HiVT (not LoVT) and an i64 add; confirm intent.
+  PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend);
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2));
+}
+
 void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT LoVT, HiVT;
@@ -2517,10 +2560,10 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
   else
     std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl);
 
+  MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      N->getPointerInfo(), MachineMemOperand::MOLoad,
-      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
-      N->getRanges());
+      N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
     SDValue PassThru = MGT->getPassThru();
@@ -4321,10 +4364,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
     std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL);
 
   SDValue Lo;
+  MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      N->getPointerInfo(), MachineMemOperand::MOStore,
-      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
-      N->getRanges());
+      N->getPointerInfo(), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
     SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
@@ -4784,6 +4827,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
 #endif
     report_fatal_error("Do not know how to widen the result of this operator!");
 
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+    Res = WidenVecRes_LOOP_DEPENDENCE_MASK(N);
+    break;
   case ISD::MERGE_VALUES:      Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
   case ISD::ADDRSPACECAST:
     Res = WidenVecRes_ADDRSPACECAST(N);
@@ -5986,6 +6033,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   return CreateStackStoreLoad(InOp, WidenVT);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
+  return DAG.getNode( // Same node, result type from getTypeToTransformTo.
+      N->getOpcode(), SDLoc(N),
+      TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)),
+      N->getOperand(0), N->getOperand(1), N->getOperand(2)); // Same operands.
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
   SDLoc dl(N);
   // Build a vector with undefined for the new nodes.
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 0a449fd011e6..72ea0898f975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -63,6 +63,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
   HorizontalVerticalBalance = 0;
 }
 
+ResourcePriorityQueue::~ResourcePriorityQueue() = default; // Out-of-line dtor.
+
 unsigned
 ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
   unsigned NumberDeps = 0;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3672a91e33a3..bcf25958d098 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3299,7 +3299,7 @@ SelectionDAG::getValidShiftAmountRange(SDValue V, const APInt &DemandedElts,
   return std::nullopt;
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts,
                                   unsigned Depth) const {
   assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3312,7 +3312,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, const APInt &DemandedElts,
   return std::nullopt;
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const {
   EVT VT = V.getValueType();
   APInt DemandedElts = VT.isFixedLengthVector()
@@ -3321,7 +3321,7 @@ SelectionDAG::getValidShiftAmount(SDValue V, unsigned Depth) const {
   return getValidShiftAmount(V, DemandedElts, Depth);
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts,
                                          unsigned Depth) const {
   assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3333,7 +3333,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts,
   return std::nullopt;
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const {
   EVT VT = V.getValueType();
   APInt DemandedElts = VT.isFixedLengthVector()
@@ -3342,7 +3342,7 @@ SelectionDAG::getValidMinimumShiftAmount(SDValue V, unsigned Depth) const {
   return getValidMinimumShiftAmount(V, DemandedElts, Depth);
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts,
                                          unsigned Depth) const {
   assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
@@ -3354,7 +3354,7 @@ SelectionDAG::getValidMaximumShiftAmount(SDValue V, const APInt &DemandedElts,
   return std::nullopt;
 }
 
-std::optional<uint64_t>
+std::optional<unsigned>
 SelectionDAG::getValidMaximumShiftAmount(SDValue V, unsigned Depth) const {
   EVT VT = V.getValueType();
   APInt DemandedElts = VT.isFixedLengthVector()
@@ -3828,7 +3828,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     Known = KnownBits::shl(Known, Known2, NUW, NSW, ShAmtNonZero);
 
     // Minimum shift low bits are known zero.
-    if (std::optional<uint64_t> ShMinAmt =
+    if (std::optional<unsigned> ShMinAmt =
             getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
       Known.Zero.setLowBits(*ShMinAmt);
     break;
@@ -3840,7 +3840,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
                             Op->getFlags().hasExact());
 
     // Minimum shift high bits are known zero.
-    if (std::optional<uint64_t> ShMinAmt =
+    if (std::optional<unsigned> ShMinAmt =
             getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
       Known.Zero.setHighBits(*ShMinAmt);
     break;
@@ -3850,6 +3850,22 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false,
                             Op->getFlags().hasExact());
     break;
+  case ISD::ROTL:
+  case ISD::ROTR:
+    if (ConstantSDNode *C =
+            isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
+      unsigned Amt = C->getAPIntValue().urem(BitWidth);
+
+      Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+      // Canonicalize to ROTR.
+      if (Opcode == ISD::ROTL && Amt != 0)
+        Amt = BitWidth - Amt;
+
+      Known.Zero = Known.Zero.rotr(Amt);
+      Known.One = Known.One.rotr(Amt);
+    }
+    break;
   case ISD::FSHL:
   case ISD::FSHR:
     if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) {
@@ -3868,15 +3884,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
       Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
       if (Opcode == ISD::FSHL) {
-        Known.One <<= Amt;
-        Known.Zero <<= Amt;
-        Known2.One.lshrInPlace(BitWidth - Amt);
-        Known2.Zero.lshrInPlace(BitWidth - Amt);
+        Known <<= Amt;
+        Known2 >>= BitWidth - Amt;
       } else {
-        Known.One <<= BitWidth - Amt;
-        Known.Zero <<= BitWidth - Amt;
-        Known2.One.lshrInPlace(Amt);
-        Known2.Zero.lshrInPlace(Amt);
+        Known <<= BitWidth - Amt;
+        Known2 >>= Amt;
       }
       Known = Known.unionWith(Known2);
     }
@@ -4875,15 +4887,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRA:
     Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
     // SRA X, C -> adds C sign bits.
-    if (std::optional<uint64_t> ShAmt =
+    if (std::optional<unsigned> ShAmt =
             getValidMinimumShiftAmount(Op, DemandedElts, Depth + 1))
-      Tmp = std::min<uint64_t>(Tmp + *ShAmt, VTBits);
+      Tmp = std::min(Tmp + *ShAmt, VTBits);
     return Tmp;
   case ISD::SHL:
     if (std::optional<ConstantRange> ShAmtRange =
             getValidShiftAmountRange(Op, DemandedElts, Depth + 1)) {
-      uint64_t MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue();
-      uint64_t MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue();
+      unsigned MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue();
+      unsigned MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue();
       // Try to look through ZERO/SIGN/ANY_EXTEND. If all extended bits are
       // shifted out, then we can compute the number of sign bits for the
       // operand being extended. A future improvement could be to pass along the
@@ -4894,7 +4906,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
         EVT ExtVT = Ext.getValueType();
         SDValue Extendee = Ext.getOperand(0);
         EVT ExtendeeVT = Extendee.getValueType();
-        uint64_t SizeDifference =
+        unsigned SizeDifference =
             ExtVT.getScalarSizeInBits() - ExtendeeVT.getScalarSizeInBits();
         if (SizeDifference <= MinShAmt) {
           Tmp = SizeDifference +
@@ -5127,7 +5139,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
 
     // If the sign portion ends in our element the subtraction gives correct
     // result. Otherwise it gives either negative or > bitwidth result
-    return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth);
+    return std::clamp(KnownSign - rIndex * BitWidth, 1, BitWidth);
   }
   case ISD::INSERT_VECTOR_ELT: {
     if (VT.isScalableVector())
@@ -5660,6 +5672,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::USUBSAT:
   case ISD::MULHU:
   case ISD::MULHS:
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:
   case ISD::ABDU:
   case ISD::ABDS:
   case ISD::SMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 62ba801f6992..430e47451fd4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7974,12 +7974,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   }
   case Intrinsic::amdgcn_call_whole_wave: {
     TargetLowering::ArgListTy Args;
+    bool isTailCall = I.isTailCall();
 
     // The first argument is the callee. Skip it when assembling the call args.
     for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
       TargetLowering::ArgListEntry Arg(getValue(I.getArgOperand(Idx)),
                                        I.getArgOperand(Idx)->getType());
       Arg.setAttributes(&I, Idx);
+
+      // If we have an explicit sret argument that is an Instruction, (i.e., it
+      // might point to function-local memory), we can't meaningfully tail-call.
+      if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx)))
+        isTailCall = false;
+
       Args.push_back(Arg);
     }
 
@@ -7994,7 +8001,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
         .setChain(getRoot())
         .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
                    getValue(I.getArgOperand(0)), std::move(Args))
-        .setTailCall(false)
+        .setTailCall(isTailCall && canTailCall(I))
         .setIsPreallocated(
             I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
         .setConvergent(I.isConvergent())
@@ -8295,6 +8302,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitVectorExtractLastActive(I, Intrinsic);
     return;
   }
+  case Intrinsic::loop_dependence_war_mask:
+    setValue(&I,
+             DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl,
+                         EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
+                         getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+    return;
+  case Intrinsic::loop_dependence_raw_mask:
+    setValue(&I,
+             DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl,
+                         EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
+                         getValue(I.getOperand(1)), getValue(I.getOperand(2))));
+    return;
   }
 }
 
@@ -8456,8 +8475,11 @@ void SelectionDAGBuilder::visitVPLoad(
   MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
   bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML);
   SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+      MachinePointerInfo(PtrOperand), MMOFlags,
       LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
   LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
                      MMO, false /*IsExpanding */);
@@ -8508,9 +8530,11 @@ void SelectionDAGBuilder::visitVPGather(
     Alignment = DAG.getEVTAlign(VT.getScalarType());
   unsigned AS =
     PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
+      MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      *Alignment, AAInfo, Ranges);
   SDValue Base, Index, Scale;
   bool UniformBase =
       getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
@@ -8546,8 +8570,11 @@ void SelectionDAGBuilder::visitVPStore(
     Alignment = DAG.getEVTAlign(VT);
   SDValue Ptr = OpValues[1];
   SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+      MachinePointerInfo(PtrOperand), MMOFlags,
       LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
   ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
                       OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
@@ -8569,9 +8596,11 @@ void SelectionDAGBuilder::visitVPScatter(
     Alignment = DAG.getEVTAlign(VT.getScalarType());
   unsigned AS =
       PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
+      MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      *Alignment, AAInfo);
   SDValue Base, Index, Scale;
   bool UniformBase =
       getUniformBase(PtrOperand, Base, Index, Scale, this, VPIntrin.getParent(),
@@ -8609,9 +8638,12 @@ void SelectionDAGBuilder::visitVPStridedLoad(
   bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML);
   SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
+      MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      *Alignment, AAInfo, Ranges);
 
   SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
                                     OpValues[2], OpValues[3], MMO,
@@ -8632,9 +8664,12 @@ void SelectionDAGBuilder::visitVPStridedStore(
     Alignment = DAG.getEVTAlign(VT.getScalarType());
   AAMDNodes AAInfo = VPIntrin.getAAMetadata();
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MachineMemOperand::Flags MMOFlags =
+      TLI.getVPIntrinsicMemOperandFlags(VPIntrin);
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
+      MachinePointerInfo(AS), MMOFlags, LocationSize::beforeOrAfterPointer(),
+      *Alignment, AAInfo);
 
   SDValue ST = DAG.getStridedStoreVP(
       getMemoryRoot(), DL, OpValues[0], OpValues[1],
@@ -8901,6 +8936,29 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
   return Result;
 }
 
+bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const {
+  bool isMustTailCall = CB.isMustTailCall();
+
+  // Avoid emitting tail calls in functions with the disable-tail-calls
+  // attribute, unless this call is marked musttail.
+  const Function *Caller = CB.getParent()->getParent();
+  if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
+          "true" &&
+      !isMustTailCall)
+    return false;
+
+  // We can't tail call inside a function with a swifterror argument (this
+  // holds for musttail calls too). Lowering does not support this yet: it
+  // would have to move into the swifterror register before the call.
+  if (DAG.getTargetLoweringInfo().supportSwiftError() &&
+      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return false;
+
+  // Check if target-independent constraints permit a tail call here.
+  // Target-dependent constraints are checked within TLI->LowerCallTo.
+  return isInTailCallPosition(CB, DAG.getTarget());
+}
+
 void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
                                       bool isTailCall, bool isMustTailCall,
                                       const BasicBlock *EHPadBB,
@@ -8915,21 +8973,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
   const Value *SwiftErrorVal = nullptr;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
-  if (isTailCall) {
-    // Avoid emitting tail calls in functions with the disable-tail-calls
-    // attribute.
-    auto *Caller = CB.getParent()->getParent();
-    if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
-        "true" && !isMustTailCall)
-      isTailCall = false;
-
-    // We can't tail call inside a function with a swifterror argument. Lowering
-    // does not support this yet. It would have to move into the swifterror
-    // register before the call.
-    if (TLI.supportSwiftError() &&
-        Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
-      isTailCall = false;
-  }
+  if (isTailCall)
+    isTailCall = canTailCall(CB);
 
   for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
     const Value *V = *I;
@@ -8969,11 +9014,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
     Args.push_back(Entry);
   }
 
-  // Check if target-independent constraints permit a tail call here.
-  // Target-dependent constraints are checked within TLI->LowerCallTo.
-  if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget()))
-    isTailCall = false;
-
   // Disable tail calls if there is an swifterror argument. Targets have not
   // been updated to support tail calls.
   if (TLI.supportSwiftError() && SwiftErrorVal)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index e0835e631035..c7577fa335fe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -408,6 +408,10 @@ public:
                    bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr,
                    const TargetLowering::PtrAuthInfo *PAI = nullptr);
 
+  // Check some of the target-independent constraints for tail calls. This does
+  // not iterate over the call arguments.
+  bool canTailCall(const CallBase &CB) const;
+
   // Lower range metadata from 0 to N to assert zext to an integer of nearest
   // floor power of two.
   SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 900da7645504..4b2a00c2e2cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -587,6 +587,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
     return "partial_reduce_smla";
   case ISD::PARTIAL_REDUCE_SUMLA:
     return "partial_reduce_sumla";
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+    return "loop_dep_war";
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+    return "loop_dep_raw";
 
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index ece50ed95fc4..e61558c59bf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1729,10 +1729,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
     // Setup an EH landing-pad block.
     FuncInfo->ExceptionPointerVirtReg = Register();
     FuncInfo->ExceptionSelectorVirtReg = Register();
-    if (LLVMBB->isEHPad())
+    if (LLVMBB->isEHPad()) {
       if (!PrepareEHLandingPad())
         continue;
 
+      if (!FastIS) {
+        SDValue NewRoot = TLI->lowerEHPadEntry(CurDAG->getRoot(),
+                                               SDB->getCurSDLoc(), *CurDAG);
+        if (NewRoot && NewRoot != CurDAG->getRoot())
+          CurDAG->setRoot(NewRoot);
+      }
+    }
+
     // Before doing SelectionDAG ISel, see if FastISel has been requested.
     if (FastIS) {
       if (LLVMBB != &Fn.getEntryBlock())
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 402a012e8e55..fd6d20e146bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -832,7 +832,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
   case ISD::SHL: {
     // If we are only demanding sign bits then we can use the shift source
     // directly.
-    if (std::optional<uint64_t> MaxSA =
+    if (std::optional<unsigned> MaxSA =
             DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
       SDValue Op0 = Op.getOperand(0);
       unsigned ShAmt = *MaxSA;
@@ -847,7 +847,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
   case ISD::SRL: {
     // If we are only demanding sign bits then we can use the shift source
     // directly.
-    if (std::optional<uint64_t> MaxSA =
+    if (std::optional<unsigned> MaxSA =
             DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
       SDValue Op0 = Op.getOperand(0);
       unsigned ShAmt = *MaxSA;
@@ -1780,7 +1780,7 @@ bool TargetLowering::SimplifyDemandedBits(
     SDValue Op1 = Op.getOperand(1);
     EVT ShiftVT = Op1.getValueType();
 
-    if (std::optional<uint64_t> KnownSA =
+    if (std::optional<unsigned> KnownSA =
             TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
       unsigned ShAmt = *KnownSA;
       if (ShAmt == 0)
@@ -1792,7 +1792,7 @@ bool TargetLowering::SimplifyDemandedBits(
       // TODO - support non-uniform vector amounts.
       if (Op0.getOpcode() == ISD::SRL) {
         if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
-          if (std::optional<uint64_t> InnerSA =
+          if (std::optional<unsigned> InnerSA =
                   TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
             unsigned C1 = *InnerSA;
             unsigned Opc = ISD::SHL;
@@ -1832,7 +1832,7 @@ bool TargetLowering::SimplifyDemandedBits(
         // TODO - support non-uniform vector amounts.
         if (InnerOp.getOpcode() == ISD::SRL && Op0.hasOneUse() &&
             InnerOp.hasOneUse()) {
-          if (std::optional<uint64_t> SA2 = TLO.DAG.getValidShiftAmount(
+          if (std::optional<unsigned> SA2 = TLO.DAG.getValidShiftAmount(
                   InnerOp, DemandedElts, Depth + 2)) {
             unsigned InnerShAmt = *SA2;
             if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
@@ -1858,8 +1858,7 @@ bool TargetLowering::SimplifyDemandedBits(
         Op->dropFlags(SDNodeFlags::NoWrap);
         return true;
       }
-      Known.Zero <<= ShAmt;
-      Known.One <<= ShAmt;
+      Known <<= ShAmt;
       // low bits known zero.
       Known.Zero.setLowBits(ShAmt);
 
@@ -1950,7 +1949,7 @@ bool TargetLowering::SimplifyDemandedBits(
 
     // If we are only demanding sign bits then we can use the shift source
     // directly.
-    if (std::optional<uint64_t> MaxSA =
+    if (std::optional<unsigned> MaxSA =
             TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
       unsigned ShAmt = *MaxSA;
       unsigned NumSignBits =
@@ -1966,7 +1965,7 @@ bool TargetLowering::SimplifyDemandedBits(
     SDValue Op1 = Op.getOperand(1);
     EVT ShiftVT = Op1.getValueType();
 
-    if (std::optional<uint64_t> KnownSA =
+    if (std::optional<unsigned> KnownSA =
             TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
       unsigned ShAmt = *KnownSA;
       if (ShAmt == 0)
@@ -1978,7 +1977,7 @@ bool TargetLowering::SimplifyDemandedBits(
       // TODO - support non-uniform vector amounts.
       if (Op0.getOpcode() == ISD::SHL) {
         if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
-          if (std::optional<uint64_t> InnerSA =
+          if (std::optional<unsigned> InnerSA =
                   TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
             unsigned C1 = *InnerSA;
             unsigned Opc = ISD::SRL;
@@ -1998,7 +1997,7 @@ bool TargetLowering::SimplifyDemandedBits(
       // single sra. We can do this if the top bits are never demanded.
       if (Op0.getOpcode() == ISD::SRA && Op0.hasOneUse()) {
         if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
-          if (std::optional<uint64_t> InnerSA =
+          if (std::optional<unsigned> InnerSA =
                   TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
             unsigned C1 = *InnerSA;
             // Clamp the combined shift amount if it exceeds the bit width.
@@ -2042,8 +2041,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
                                Depth + 1))
         return true;
-      Known.Zero.lshrInPlace(ShAmt);
-      Known.One.lshrInPlace(ShAmt);
+      Known >>= ShAmt;
       // High bits known zero.
       Known.Zero.setHighBits(ShAmt);
 
@@ -2064,7 +2062,7 @@ bool TargetLowering::SimplifyDemandedBits(
 
     // If we are only demanding sign bits then we can use the shift source
     // directly.
-    if (std::optional<uint64_t> MaxSA =
+    if (std::optional<unsigned> MaxSA =
             TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
       unsigned ShAmt = *MaxSA;
       // Must already be signbits in DemandedBits bounds, and can't demand any
@@ -2103,7 +2101,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if (DemandedBits.isOne())
       return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
 
-    if (std::optional<uint64_t> KnownSA =
+    if (std::optional<unsigned> KnownSA =
             TLO.DAG.getValidShiftAmount(Op, DemandedElts, Depth + 1)) {
       unsigned ShAmt = *KnownSA;
       if (ShAmt == 0)
@@ -2112,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedBits(
       // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target
       // supports sext_inreg.
       if (Op0.getOpcode() == ISD::SHL) {
-        if (std::optional<uint64_t> InnerSA =
+        if (std::optional<unsigned> InnerSA =
                 TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
           unsigned LowBits = BitWidth - ShAmt;
           EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits);
@@ -2153,8 +2151,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
                                Depth + 1))
         return true;
-      Known.Zero.lshrInPlace(ShAmt);
-      Known.One.lshrInPlace(ShAmt);
+      Known >>= ShAmt;
 
       // If the input sign bit is known to be zero, or if none of the top bits
       // are demanded, turn this into an unsigned shift right.
@@ -2225,10 +2222,8 @@ bool TargetLowering::SimplifyDemandedBits(
                                Depth + 1))
         return true;
 
-      Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt));
-      Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt));
-      Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
-      Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
+      Known2 <<= (IsFSHL ? Amt : (BitWidth - Amt));
+      Known >>= (IsFSHL ? (BitWidth - Amt) : Amt);
       Known = Known.unionWith(Known2);
 
       // Attempt to avoid multi-use ops if we don't need anything from them.
@@ -2363,8 +2358,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
                              Depth + 1))
       return true;
-    Known.One = Known2.One.reverseBits();
-    Known.Zero = Known2.Zero.reverseBits();
+    Known = Known2.reverseBits();
     break;
   }
   case ISD::BSWAP: {
@@ -2397,8 +2391,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
                              Depth + 1))
       return true;
-    Known.One = Known2.One.byteSwap();
-    Known.Zero = Known2.Zero.byteSwap();
+    Known = Known2.byteSwap();
     break;
   }
   case ISD::CTPOP: {
@@ -2664,11 +2657,11 @@ bool TargetLowering::SimplifyDemandedBits(
           break;
         }
 
-        std::optional<uint64_t> ShAmtC =
+        std::optional<unsigned> ShAmtC =
             TLO.DAG.getValidShiftAmount(Src, DemandedElts, Depth + 2);
         if (!ShAmtC || *ShAmtC >= BitWidth)
           break;
-        uint64_t ShVal = *ShAmtC;
+        unsigned ShVal = *ShAmtC;
 
         APInt HighBits =
             APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
@@ -3234,27 +3227,6 @@ bool TargetLowering::SimplifyDemandedVectorElts(
       KnownUndef.setAllBits();
       return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
     }
-    SDValue ScalarSrc = Op.getOperand(0);
-    if (ScalarSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
-      SDValue Src = ScalarSrc.getOperand(0);
-      SDValue Idx = ScalarSrc.getOperand(1);
-      EVT SrcVT = Src.getValueType();
-
-      ElementCount SrcEltCnt = SrcVT.getVectorElementCount();
-
-      if (SrcEltCnt.isScalable())
-        return false;
-
-      unsigned NumSrcElts = SrcEltCnt.getFixedValue();
-      if (isNullConstant(Idx)) {
-        APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0);
-        APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts);
-        APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts);
-        if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
-                                       TLO, Depth + 1))
-          return true;
-      }
-    }
     KnownUndef.setHighBits(NumElts - 1);
     break;
   }
@@ -9740,8 +9712,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
 SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
-  SDValue LHS = DAG.getFreeze(N->getOperand(0));
-  SDValue RHS = DAG.getFreeze(N->getOperand(1));
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
   bool IsSigned = N->getOpcode() == ISD::ABDS;
 
   // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
@@ -9749,34 +9721,37 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
   unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
   unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
   if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) {
+    LHS = DAG.getFreeze(LHS);
+    RHS = DAG.getFreeze(RHS);
     SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
     SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
     return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
   }
 
   // abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs))
-  if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT))
+  if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) {
+    LHS = DAG.getFreeze(LHS);
+    RHS = DAG.getFreeze(RHS);
     return DAG.getNode(ISD::OR, dl, VT,
                        DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
                        DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
+  }
 
   // If the subtract doesn't overflow then just use abs(sub())
-  // NOTE: don't use frozen operands for value tracking.
-  bool IsNonNegative = DAG.SignBitIsZero(N->getOperand(1)) &&
-                       DAG.SignBitIsZero(N->getOperand(0));
+  bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS);
 
-  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(0),
-                             N->getOperand(1)))
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS))
     return DAG.getNode(ISD::ABS, dl, VT,
                        DAG.getNode(ISD::SUB, dl, VT, LHS, RHS));
 
-  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, N->getOperand(1),
-                             N->getOperand(0)))
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS))
     return DAG.getNode(ISD::ABS, dl, VT,
                        DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
 
   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
+  LHS = DAG.getFreeze(LHS);
+  RHS = DAG.getFreeze(RHS);
   SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
 
   // Branchless expansion iff cmp result is allbits:
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 9e49dddd46ba..0d7b128fc736 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -996,7 +996,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
       (MI->getOpcode() != CombineOpc && CombineOpc != 0))
     return false;
   // Must only used by the user we combine with.
-  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+  if (!MRI.hasOneNonDBGUse(MO.getReg()))
     return false;
 
   return true;
@@ -1456,11 +1456,13 @@ void TargetInstrInfo::reassociateOps(
   MIB1->clearFlag(MachineInstr::MIFlag::NoSWrap);
   MIB1->clearFlag(MachineInstr::MIFlag::NoUWrap);
   MIB1->clearFlag(MachineInstr::MIFlag::IsExact);
+  MIB1->clearFlag(MachineInstr::MIFlag::Disjoint);
 
   MIB2->setFlags(IntersectedFlags);
   MIB2->clearFlag(MachineInstr::MIFlag::NoSWrap);
   MIB2->clearFlag(MachineInstr::MIFlag::NoUWrap);
   MIB2->clearFlag(MachineInstr::MIFlag::IsExact);
+  MIB2->clearFlag(MachineInstr::MIFlag::Disjoint);
 
   setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
 
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9ffced80b07f..c23281a820b2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -612,23 +612,23 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
 ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
     RTLIB::LibcallImpl Impl) const {
   switch (Impl) {
-  case RTLIB::__aeabi_dcmpeq__une:
-  case RTLIB::__aeabi_fcmpeq__une:
+  case RTLIB::impl___aeabi_dcmpeq__une:
+  case RTLIB::impl___aeabi_fcmpeq__une:
     // Usage in the eq case, so we have to invert the comparison.
     return ISD::SETEQ;
-  case RTLIB::__aeabi_dcmpeq__oeq:
-  case RTLIB::__aeabi_fcmpeq__oeq:
+  case RTLIB::impl___aeabi_dcmpeq__oeq:
+  case RTLIB::impl___aeabi_fcmpeq__oeq:
     // Normal comparison to boolean value.
     return ISD::SETNE;
-  case RTLIB::__aeabi_dcmplt:
-  case RTLIB::__aeabi_dcmple:
-  case RTLIB::__aeabi_dcmpge:
-  case RTLIB::__aeabi_dcmpgt:
-  case RTLIB::__aeabi_dcmpun:
-  case RTLIB::__aeabi_fcmplt:
-  case RTLIB::__aeabi_fcmple:
-  case RTLIB::__aeabi_fcmpge:
-  case RTLIB::__aeabi_fcmpgt:
+  case RTLIB::impl___aeabi_dcmplt:
+  case RTLIB::impl___aeabi_dcmple:
+  case RTLIB::impl___aeabi_dcmpge:
+  case RTLIB::impl___aeabi_dcmpgt:
+  case RTLIB::impl___aeabi_dcmpun:
+  case RTLIB::impl___aeabi_fcmplt:
+  case RTLIB::impl___aeabi_fcmple:
+  case RTLIB::impl___aeabi_fcmpge:
+  case RTLIB::impl___aeabi_fcmpgt:
     /// The AEABI versions return a typical boolean value, so we can compare
     /// against the integer result as simply != 0.
     return ISD::SETNE;
@@ -900,6 +900,9 @@ void TargetLoweringBase::initActions() {
     // Masked vector extracts default to expand.
     setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand);
 
+    setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Expand);
+    setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Expand);
+
     // FP environment operations default to expand.
     setOperationAction(ISD::GET_FPENV, VT, Expand);
     setOperationAction(ISD::SET_FPENV, VT, Expand);
@@ -2406,6 +2409,34 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
   return Flags;
 }
 
+MachineMemOperand::Flags TargetLoweringBase::getVPIntrinsicMemOperandFlags(
+    const VPIntrinsic &VPIntrin) const {
+  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+  Intrinsic::ID IntrinID = VPIntrin.getIntrinsicID();
+  // Select MOLoad or MOStore; any other intrinsic ID is unreachable.
+  switch (IntrinID) {
+  default:
+    llvm_unreachable("unexpected intrinsic. Existing code may be appropriate "
+                     "for it, but support must be explicitly enabled");
+  case Intrinsic::vp_load:
+  case Intrinsic::vp_gather:
+  case Intrinsic::experimental_vp_strided_load:
+    Flags = MachineMemOperand::MOLoad;
+    break;
+  case Intrinsic::vp_store:
+  case Intrinsic::vp_scatter:
+  case Intrinsic::experimental_vp_strided_store:
+    Flags = MachineMemOperand::MOStore;
+    break;
+  }
+  // Propagate !nontemporal metadata on the call as MONonTemporal.
+  if (VPIntrin.hasMetadata(LLVMContext::MD_nontemporal))
+    Flags |= MachineMemOperand::MONonTemporal;
+  // OR in the flags getTargetMMOFlags returns for this intrinsic.
+  Flags |= getTargetMMOFlags(VPIntrin);
+  return Flags;
+}
+
 Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
                                                   Instruction *Inst,
                                                   AtomicOrdering Ord) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index d19ef923ef74..ae681b9aebdf 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -247,6 +247,8 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
     break;
   case Triple::riscv32:
   case Triple::riscv64:
+  case Triple::riscv32be:
+  case Triple::riscv64be:
     LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
     PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
                           dwarf::DW_EH_PE_sdata4;
@@ -1918,6 +1920,13 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
   }
 
   emitCGProfileMetadata(Streamer, M);
+  emitPseudoProbeDescMetadata(Streamer, M, [](MCStreamer &Streamer) {
+    if (MCSymbol *Sym =
+            static_cast<MCSectionCOFF *>(Streamer.getCurrentSectionOnly())
+                ->getCOMDATSymbol())
+      if (Sym->isUndefined())
+        Streamer.emitLabel(Sym);
+  });
 }
 
 void TargetLoweringObjectFileCOFF::emitLinkerDirectives(
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 7d7c6e743fa7..b6169e6c4dc3 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -134,12 +134,18 @@ static cl::opt<cl::boolOrDefault> DebugifyCheckAndStripAll(
 static cl::opt<RunOutliner> EnableMachineOutliner(
     "enable-machine-outliner", cl::desc("Enable the machine outliner"),
     cl::Hidden, cl::ValueOptional, cl::init(RunOutliner::TargetDefault),
-    cl::values(clEnumValN(RunOutliner::AlwaysOutline, "always",
-                          "Run on all functions guaranteed to be beneficial"),
-               clEnumValN(RunOutliner::NeverOutline, "never",
-                          "Disable all outlining"),
-               // Sentinel value for unspecified option.
-               clEnumValN(RunOutliner::AlwaysOutline, "", "")));
+    cl::values(
+        clEnumValN(RunOutliner::AlwaysOutline, "always",
+                   "Run on all functions guaranteed to be beneficial"),
+        clEnumValN(RunOutliner::OptimisticPGO, "optimistic-pgo",
+                   "Outline cold code only. If a code block does not have "
+                   "profile data, optimistically assume it is cold."),
+        clEnumValN(RunOutliner::ConservativePGO, "conservative-pgo",
+                   "Outline cold code only. If a code block does not have "
+                   "profile data, conservatively assume it is hot."),
+        clEnumValN(RunOutliner::NeverOutline, "never", "Disable all outlining"),
+        // Sentinel value for unspecified option.
+        clEnumValN(RunOutliner::AlwaysOutline, "", "")));
 static cl::opt<bool> EnableGlobalMergeFunc(
     "enable-global-merge-func", cl::Hidden,
     cl::desc("Enable global merge functions that are based on hash function"));
@@ -1074,7 +1080,7 @@ bool TargetPassConfig::addISelPasses() {
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
   addPass(createPreISelIntrinsicLoweringPass());
   addPass(createExpandLargeDivRemPass());
-  addPass(createExpandFpPass());
+  addPass(createExpandFpPass(getOptLevel()));
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
@@ -1224,12 +1230,9 @@ void TargetPassConfig::addMachinePasses() {
   if (TM->Options.EnableMachineOutliner &&
       getOptLevel() != CodeGenOptLevel::None &&
       EnableMachineOutliner != RunOutliner::NeverOutline) {
-    bool RunOnAllFunctions =
-        (EnableMachineOutliner == RunOutliner::AlwaysOutline);
-    bool AddOutliner =
-        RunOnAllFunctions || TM->Options.SupportsDefaultOutlining;
-    if (AddOutliner)
-      addPass(createMachineOutlinerPass(RunOnAllFunctions));
+    if (EnableMachineOutliner != RunOutliner::TargetDefault ||
+        TM->Options.SupportsDefaultOutlining)
+      addPass(createMachineOutlinerPass(EnableMachineOutliner));
   }
 
   if (GCEmptyBlocks)
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
index 4e4d86e5cb8d..1c0ddc8e1ca3 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
@@ -55,7 +55,7 @@ Error DwarfStreamer::init(Triple TheTriple,
   TripleName = TheTriple.getTriple();
 
   // Create all the MC Objects.
-  MRI.reset(TheTarget->createMCRegInfo(TripleName));
+  MRI.reset(TheTarget->createMCRegInfo(TheTriple));
   if (!MRI)
     return createStringError(std::errc::invalid_argument,
                              "no register info for target %s",
@@ -64,12 +64,12 @@ Error DwarfStreamer::init(Triple TheTriple,
   MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
   MCOptions.AsmVerbose = true;
   MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
-  MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+  MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions));
   if (!MAI)
     return createStringError(std::errc::invalid_argument,
                              "no asm info for target %s", TripleName.c_str());
 
-  MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+  MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", ""));
   if (!MSTI)
     return createStringError(std::errc::invalid_argument,
                              "no subtarget info for target %s",
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
index 379f60b0bfb9..9222235d7a41 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp
@@ -35,7 +35,7 @@ Error DwarfEmitterImpl::init(Triple TheTriple,
   TripleName = TheTriple.getTriple();
 
   // Create all the MC Objects.
-  MRI.reset(TheTarget->createMCRegInfo(TripleName));
+  MRI.reset(TheTarget->createMCRegInfo(TheTriple));
   if (!MRI)
     return createStringError(std::errc::invalid_argument,
                              "no register info for target %s",
@@ -44,12 +44,12 @@ Error DwarfEmitterImpl::init(Triple TheTriple,
   MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
   MCOptions.AsmVerbose = true;
   MCOptions.MCUseDwarfDirectory = MCTargetOptions::EnableDwarfDirectory;
-  MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+  MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions));
   if (!MAI)
     return createStringError(std::errc::invalid_argument,
                              "no asm info for target %s", TripleName.c_str());
 
-  MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+  MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", ""));
   if (!MSTI)
     return createStringError(std::errc::invalid_argument,
                              "no subtarget info for target %s",
diff --git a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h
index b035c4b1d6c3..03c0566f58f8 100644
--- a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h
+++ b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h
@@ -73,19 +73,19 @@ private:
     TripleName = TheTriple.getTriple();
 
     // Create all the MC Objects.
-    MRI.reset(TheTarget->createMCRegInfo(TripleName));
+    MRI.reset(TheTarget->createMCRegInfo(TheTriple));
     if (!MRI)
       return createStringError(std::errc::invalid_argument,
                                "no register info for target %s",
                                TripleName.c_str());
 
     MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
-    MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+    MAI.reset(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions));
     if (!MAI)
       return createStringError(std::errc::invalid_argument,
                                "no asm info for target %s", TripleName.c_str());
 
-    MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+    MSTI.reset(TheTarget->createMCSubtargetInfo(TheTriple, "", ""));
     if (!MSTI)
       return createStringError(std::errc::invalid_argument,
                                "no subtarget info for target %s",
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp
index ebcd4dda5048..078ebf4e7c03 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFExpressionPrinter.cpp
@@ -48,13 +48,52 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS,
                     DIDumpOptions DumpOpts, const DWARFExpression *Expr,
                     DWARFUnit *U) {
   if (Op->isError()) {
-    OS << "<decoding error>";
+    if (!DumpOpts.PrintRegisterOnly)
+      OS << "<decoding error>";
     return false;
   }
 
-  StringRef Name = OperationEncodingString(Op->getCode());
-  assert(!Name.empty() && "DW_OP has no name!");
-  OS << Name;
+  // In "register-only" mode, still show simple constant-valued locations.
+  // This lets clients print annotations like "i = 0" when the location is
+  // a constant (e.g. DW_OP_constu/consts ... DW_OP_stack_value).
+  // We continue to suppress all other non-register ops in this mode.
+  if (DumpOpts.PrintRegisterOnly) {
+    // First, try pretty-printing registers (existing behavior below also does
+    // this, but we need to short-circuit here to avoid printing opcode names).
+    if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) ||
+        (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) ||
+        Op->getCode() == DW_OP_bregx || Op->getCode() == DW_OP_regx ||
+        Op->getCode() == DW_OP_regval_type) {
+      if (prettyPrintRegisterOp(U, OS, DumpOpts, Op->getCode(),
+                                Op->getRawOperands()))
+        return true;
+      // If we couldn't pretty-print, fall through and suppress.
+    }
+
+    // Show constants (decimal), suppress everything else.
+    if (Op->getCode() == DW_OP_constu) {
+      OS << (uint64_t)Op->getRawOperand(0);
+      return true;
+    }
+    if (Op->getCode() == DW_OP_consts) {
+      OS << (int64_t)Op->getRawOperand(0);
+      return true;
+    }
+    if (Op->getCode() >= DW_OP_lit0 && Op->getCode() <= DW_OP_lit31) {
+      OS << (unsigned)(Op->getCode() - DW_OP_lit0);
+      return true;
+    }
+    if (Op->getCode() == DW_OP_stack_value)
+      return true; // metadata; don't print a token
+
+    return true; // suppress other opcodes silently in register-only mode
+  }
+
+  if (!DumpOpts.PrintRegisterOnly) {
+    StringRef Name = OperationEncodingString(Op->getCode());
+    assert(!Name.empty() && "DW_OP has no name!");
+    OS << Name;
+  }
 
   if ((Op->getCode() >= DW_OP_breg0 && Op->getCode() <= DW_OP_breg31) ||
       (Op->getCode() >= DW_OP_reg0 && Op->getCode() <= DW_OP_reg31) ||
@@ -64,48 +103,51 @@ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS,
                               Op->getRawOperands()))
       return true;
 
-  for (unsigned Operand = 0; Operand < Op->getDescription().Op.size();
-       ++Operand) {
-    unsigned Size = Op->getDescription().Op[Operand];
-    unsigned Signed = Size & DWARFExpression::Operation::SignBit;
-
-    if (Size == DWARFExpression::Operation::SizeSubOpLEB) {
-      StringRef SubName =
-          SubOperationEncodingString(Op->getCode(), Op->getRawOperand(Operand));
-      assert(!SubName.empty() && "DW_OP SubOp has no name!");
-      OS << " " << SubName;
-    } else if (Size == DWARFExpression::Operation::BaseTypeRef && U) {
-      // For DW_OP_convert the operand may be 0 to indicate that conversion to
-      // the generic type should be done. The same holds for DW_OP_reinterpret,
-      // which is currently not supported.
-      if (Op->getCode() == DW_OP_convert && Op->getRawOperand(Operand) == 0)
-        OS << " 0x0";
-      else
-        prettyPrintBaseTypeRef(U, OS, DumpOpts, Op->getRawOperands(), Operand);
-    } else if (Size == DWARFExpression::Operation::WasmLocationArg) {
-      assert(Operand == 1);
-      switch (Op->getRawOperand(0)) {
-      case 0:
-      case 1:
-      case 2:
-      case 3: // global as uint32
-      case 4:
-        OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand));
-        break;
-      default:
-        assert(false);
+  if (!DumpOpts.PrintRegisterOnly) {
+    for (unsigned Operand = 0; Operand < Op->getDescription().Op.size();
+         ++Operand) {
+      unsigned Size = Op->getDescription().Op[Operand];
+      unsigned Signed = Size & DWARFExpression::Operation::SignBit;
+
+      if (Size == DWARFExpression::Operation::SizeSubOpLEB) {
+        StringRef SubName = SubOperationEncodingString(
+            Op->getCode(), Op->getRawOperand(Operand));
+        assert(!SubName.empty() && "DW_OP SubOp has no name!");
+        OS << " " << SubName;
+      } else if (Size == DWARFExpression::Operation::BaseTypeRef && U) {
+        // For DW_OP_convert the operand may be 0 to indicate that conversion to
+        // the generic type should be done. The same holds for
+        // DW_OP_reinterpret, which is currently not supported.
+        if (Op->getCode() == DW_OP_convert && Op->getRawOperand(Operand) == 0)
+          OS << " 0x0";
+        else
+          prettyPrintBaseTypeRef(U, OS, DumpOpts, Op->getRawOperands(),
+                                 Operand);
+      } else if (Size == DWARFExpression::Operation::WasmLocationArg) {
+        assert(Operand == 1);
+        switch (Op->getRawOperand(0)) {
+        case 0:
+        case 1:
+        case 2:
+        case 3: // global as uint32
+        case 4:
+          OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand));
+          break;
+        default:
+          assert(false);
+        }
+      } else if (Size == DWARFExpression::Operation::SizeBlock) {
+        uint64_t Offset = Op->getRawOperand(Operand);
+        for (unsigned i = 0; i < Op->getRawOperand(Operand - 1); ++i)
+          OS << format(" 0x%02x",
+                       static_cast<uint8_t>(Expr->getData()[Offset++]));
+      } else {
+        if (Signed)
+          OS << format(" %+" PRId64, (int64_t)Op->getRawOperand(Operand));
+        else if (Op->getCode() != DW_OP_entry_value &&
+                 Op->getCode() != DW_OP_GNU_entry_value)
+          OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand));
       }
-    } else if (Size == DWARFExpression::Operation::SizeBlock) {
-      uint64_t Offset = Op->getRawOperand(Operand);
-      for (unsigned i = 0; i < Op->getRawOperand(Operand - 1); ++i)
-        OS << format(" 0x%02x",
-                     static_cast<uint8_t>(Expr->getData()[Offset++]));
-    } else {
-      if (Signed)
-        OS << format(" %+" PRId64, (int64_t)Op->getRawOperand(Operand));
-      else if (Op->getCode() != DW_OP_entry_value &&
-               Op->getCode() != DW_OP_GNU_entry_value)
-        OS << format(" 0x%" PRIx64, Op->getRawOperand(Operand));
     }
   }
   return true;
@@ -120,29 +162,30 @@ void printDwarfExpression(const DWARFExpression *E, raw_ostream &OS,
 
   for (auto &Op : *E) {
     DumpOpts.IsEH = IsEH;
-    if (!printOp(&Op, OS, DumpOpts, E, U)) {
+    if (!printOp(&Op, OS, DumpOpts, E, U) && !DumpOpts.PrintRegisterOnly) {
       uint64_t FailOffset = Op.getEndOffset();
       while (FailOffset < E->getData().size())
         OS << format(" %02x", static_cast<uint8_t>(E->getData()[FailOffset++]));
       return;
     }
+    if (!DumpOpts.PrintRegisterOnly) {
+      if (Op.getCode() == DW_OP_entry_value ||
+          Op.getCode() == DW_OP_GNU_entry_value) {
+        OS << "(";
+        EntryValExprSize = Op.getRawOperand(0);
+        EntryValStartOffset = Op.getEndOffset();
+        continue;
+      }
 
-    if (Op.getCode() == DW_OP_entry_value ||
-        Op.getCode() == DW_OP_GNU_entry_value) {
-      OS << "(";
-      EntryValExprSize = Op.getRawOperand(0);
-      EntryValStartOffset = Op.getEndOffset();
-      continue;
-    }
+      if (EntryValExprSize) {
+        EntryValExprSize -= Op.getEndOffset() - EntryValStartOffset;
+        if (EntryValExprSize == 0)
+          OS << ")";
+      }
 
-    if (EntryValExprSize) {
-      EntryValExprSize -= Op.getEndOffset() - EntryValStartOffset;
-      if (EntryValExprSize == 0)
-        OS << ")";
+      if (Op.getEndOffset() < E->getData().size())
+        OS << ", ";
     }
-
-    if (Op.getEndOffset() < E->getData().size())
-      OS << ", ";
   }
 }
 
diff --git a/llvm/lib/DebugInfo/GSYM/GsymContext.cpp b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp
index 18be6d098546..62b4caa327d8 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymContext.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp
@@ -14,6 +14,7 @@
 using namespace llvm;
 using namespace llvm::gsym;
 
+GsymContext::~GsymContext() = default;
 GsymContext::GsymContext(std::unique_ptr<GsymReader> Reader)
     : DIContext(CK_GSYM), Reader(std::move(Reader)) {}
 
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 0df9137a3bd3..0d0383158dd4 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -274,9 +274,10 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) {
   });
 }
 
-Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
+Error LVBinaryReader::loadGenericTargetInfo(StringRef TripleName,
                                             StringRef TheFeatures,
                                             StringRef TheCPU) {
+  Triple TheTriple(TripleName);
   std::string TargetLookupError;
   const Target *TheTarget =
       TargetRegistry::lookupTarget(TheTriple, TargetLookupError);
@@ -287,7 +288,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
   MCRegisterInfo *RegisterInfo = TheTarget->createMCRegInfo(TheTriple);
   if (!RegisterInfo)
     return createStringError(errc::invalid_argument,
-                             "no register info for target " + TheTriple);
+                             "no register info for target " + TripleName);
   MRI.reset(RegisterInfo);
 
   // Assembler properties and features.
@@ -295,7 +296,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
   MCAsmInfo *AsmInfo(TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions));
   if (!AsmInfo)
     return createStringError(errc::invalid_argument,
-                             "no assembly info for target " + TheTriple);
+                             "no assembly info for target " + TripleName);
   MAI.reset(AsmInfo);
 
   // Target subtargets.
@@ -303,14 +304,14 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
       TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures));
   if (!SubtargetInfo)
     return createStringError(errc::invalid_argument,
-                             "no subtarget info for target " + TheTriple);
+                             "no subtarget info for target " + TripleName);
   STI.reset(SubtargetInfo);
 
   // Instructions Info.
   MCInstrInfo *InstructionInfo(TheTarget->createMCInstrInfo());
   if (!InstructionInfo)
     return createStringError(errc::invalid_argument,
-                             "no instruction info for target " + TheTriple);
+                             "no instruction info for target " + TripleName);
   MII.reset(InstructionInfo);
 
   MC = std::make_unique<MCContext>(Triple(TheTriple), MAI.get(), MRI.get(),
@@ -320,7 +321,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
   MCDisassembler *DisAsm(TheTarget->createMCDisassembler(*STI, *MC));
   if (!DisAsm)
     return createStringError(errc::invalid_argument,
-                             "no disassembler for target " + TheTriple);
+                             "no disassembler for target " + TripleName);
   MD.reset(DisAsm);
 
   MCInstPrinter *InstructionPrinter(TheTarget->createMCInstPrinter(
@@ -328,7 +329,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
   if (!InstructionPrinter)
     return createStringError(errc::invalid_argument,
                              "no target assembly language printer for target " +
-                                 TheTriple);
+                                 TripleName);
   MIP.reset(InstructionPrinter);
   InstructionPrinter->setPrintImmHex(true);
 
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index 62134dfdadf4..3ba506171814 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -274,7 +274,7 @@ void LVDWARFReader::processOneAttribute(const DWARFDie &Die,
       for (DWARFAddressRange &Range : Ranges) {
         // This seems to be a tombstone for empty ranges.
         if ((Range.LowPC == Range.HighPC) ||
-            (Range.LowPC = getTombstoneAddress()))
+            (Range.LowPC == getTombstoneAddress()))
           continue;
         // Store the real upper limit for the address range.
         if (UpdateHighAddress && Range.HighPC > 0)
@@ -461,13 +461,17 @@ LVScope *LVDWARFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
         if (!CurrentRanges.empty()) {
           for (LVAddressRange &Range : CurrentRanges)
             addSectionRange(SectionIndex, CurrentScope, Range.first,
-                            Range.second);
+                            Range.second > Range.first
+                                ? Range.second - 1 // Make hi-pc inclusive
+                                : Range.second);
           CurrentRanges.clear();
         }
         // If the scope is the CU, do not update the ranges set.
         if (FoundLowPC && FoundHighPC && !IsCompileUnit) {
           addSectionRange(SectionIndex, CurrentScope, CurrentLowPC,
-                          CurrentHighPC);
+                          CurrentHighPC > CurrentLowPC
+                              ? CurrentHighPC - 1 // Make hi-pc inclusive
+                              : CurrentHighPC);
         }
       }
     }
diff --git a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp
index 328d0f5ab060..49be0edc33a1 100644
--- a/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/InputFile.cpp
@@ -586,3 +586,8 @@ bool llvm::pdb::shouldDumpSymbolGroup(uint32_t Idx, const SymbolGroup &Group,
   // Otherwise, only dump if this is the same module specified.
   return (Filters.DumpModi == Idx);
 }
+llvm::pdb::InputFile::InputFile(PDBFile *Pdb) { PdbOrObj = Pdb; }
+
+llvm::pdb::InputFile::InputFile(object::COFFObjectFile *Obj) { PdbOrObj = Obj; }
+
+llvm::pdb::InputFile::InputFile(MemoryBuffer *Buffer) { PdbOrObj = Buffer; }
diff --git a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index c350e0e0b3e1..0453eea26605 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -22,9 +22,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -96,3 +99,50 @@ Error PublicsStream::reload() {
                                 "Corrupted publics stream.");
   return Error::success();
 }
+
+// This is a reimplementation of NearestSym:
+// https://github.com/microsoft/microsoft-pdb/blob/805655a28bd8198004be2ac27e6e0290121a5e89/PDB/dbi/gsi.cpp#L1492-L1581
+std::optional<std::pair<codeview::PublicSym32, size_t>>
+PublicsStream::findByAddress(const SymbolStream &Symbols, uint16_t Segment,
+                             uint32_t Offset) const {
+  // The address map is sorted by address, so we can use lower_bound to find the
+  // position. Each element is an offset into the symbols for a public symbol.
+  auto It = llvm::lower_bound(
+      AddressMap, std::tuple(Segment, Offset),
+      [&](support::ulittle32_t Cur, auto Addr) {
+        auto Sym = Symbols.readRecord(Cur.value());
+        if (Sym.kind() != codeview::S_PUB32)
+          return false; // stop here, this is most likely corrupted debug info
+
+        auto Psym =
+            codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(
+                Sym);
+        if (!Psym) {
+          consumeError(Psym.takeError());
+          return false;
+        }
+
+        return std::tie(Psym->Segment, Psym->Offset) < Addr;
+      });
+
+  if (It == AddressMap.end())
+    return std::nullopt;
+
+  auto Sym = Symbols.readRecord(It->value());
+  if (Sym.kind() != codeview::S_PUB32)
+    return std::nullopt; // this is most likely corrupted debug info
+
+  auto MaybePsym =
+      codeview::SymbolDeserializer::deserializeAs<codeview::PublicSym32>(Sym);
+  if (!MaybePsym) {
+    consumeError(MaybePsym.takeError());
+    return std::nullopt;
+  }
+  codeview::PublicSym32 Psym = std::move(*MaybePsym);
+
+  if (std::tuple(Segment, Offset) != std::tuple(Psym.Segment, Psym.Offset))
+    return std::nullopt;
+
+  std::ptrdiff_t IterOffset = It - AddressMap.begin();
+  return std::pair{Psym, static_cast<size_t>(IterOffset)};
+}
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index eca2a09c1f77..7c8ef18f126d 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -10,6 +10,12 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
+namespace llvm {
+namespace pdb {
+PDBSymbolTypeBuiltin::~PDBSymbolTypeBuiltin() = default;
+} // namespace pdb
+} // namespace llvm
+
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
index 71a0f14368ac..0dc97104610b 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
@@ -154,18 +154,22 @@ public:
                      std::unique_ptr<LinkGraph> G, PassConfiguration PassConfig)
       : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {
     JITLinkerBase::getPassConfig().PostAllocationPasses.push_back(
-        [this](LinkGraph &G) { return gatherRISCVPCRelHi20(G); });
+        [this](LinkGraph &G) { return gatherRISCVPairs(G); });
   }
 
 private:
   DenseMap<std::pair<const Block *, orc::ExecutorAddrDiff>, const Edge *>
       RelHi20;
+  DenseMap<std::pair<const Block *, orc::ExecutorAddrDiff>, const Edge *>
+      SetULEB128;
 
-  Error gatherRISCVPCRelHi20(LinkGraph &G) {
+  Error gatherRISCVPairs(LinkGraph &G) {
     for (Block *B : G.blocks())
       for (Edge &E : B->edges())
         if (E.getKind() == R_RISCV_PCREL_HI20)
           RelHi20[{B, E.getOffset()}] = &E;
+        else if (E.getKind() == R_RISCV_SET_ULEB128)
+          SetULEB128[{B, E.getOffset()}] = &E;
 
     return Error::success();
   }
@@ -189,6 +193,20 @@ private:
                                     "for LO12 PCREL relocation type");
   }
 
+  Expected<const Edge &> getRISCVSetULEB128(const Block &B,
+                                            const Edge &E) const {
+    using namespace riscv;
+    assert(E.getKind() == R_RISCV_SUB_ULEB128 &&
+           "Can only have pair relocation for R_RISCV_SUB_ULEB128");
+
+    auto It = SetULEB128.find({&B, E.getOffset()});
+    if (It != SetULEB128.end())
+      return *It->second;
+
+    return make_error<JITLinkError>(
+        "No RISCV_SET_ULEB128 relocation type be found");
+  }
+
   Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
     using namespace riscv;
     using namespace llvm::support;
@@ -467,6 +485,21 @@ private:
       *(little32_t *)FixupPtr = static_cast<uint32_t>(Value);
       break;
     }
+    case R_RISCV_SET_ULEB128:
+      break;
+    case R_RISCV_SUB_ULEB128: {
+      auto SetULEB128 = getRISCVSetULEB128(B, E);
+      if (!SetULEB128)
+        return SetULEB128.takeError();
+      uint64_t Value = SetULEB128->getTarget().getAddress() +
+                       SetULEB128->getAddend() - E.getTarget().getAddress() -
+                       E.getAddend();
+      if (overwriteULEB128(reinterpret_cast<uint8_t *>(FixupPtr), Value) >=
+          0x80)
+        return make_error<StringError>("ULEB128 value exceeds available space",
+                                       inconvertibleErrorCode());
+      break;
+    }
     }
     return Error::success();
   }
@@ -843,6 +876,10 @@ private:
       return EdgeKind_riscv::R_RISCV_32_PCREL;
     case ELF::R_RISCV_ALIGN:
       return EdgeKind_riscv::AlignRelaxable;
+    case ELF::R_RISCV_SET_ULEB128:
+      return EdgeKind_riscv::R_RISCV_SET_ULEB128;
+    case ELF::R_RISCV_SUB_ULEB128:
+      return EdgeKind_riscv::R_RISCV_SUB_ULEB128;
     }
 
     return make_error<JITLinkError>(
diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
index a4e4daef97fb..9e9f4433a9fc 100644
--- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
@@ -84,6 +84,10 @@ const char *getEdgeKindName(Edge::Kind K) {
     return "AlignRelaxable";
   case NegDelta32:
     return "NegDelta32";
+  case R_RISCV_SET_ULEB128:
+    return "R_RISCV_SET_ULEB128";
+  case R_RISCV_SUB_ULEB128:
+    return "R_RISCV_SUB_ULEB128";
   }
   return getGenericEdgeKindName(K);
 }
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index ff48a938cbd4..afe3b671547d 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -756,59 +756,56 @@ private:
 
   Expected<TargetInfo> getTargetInfo(const Triple &TT, const StringRef &CPU,
                                      const SubtargetFeatures &TF) const {
-
-    auto TripleName = TT.str();
     std::string ErrorStr;
-    const Target *TheTarget =
-        TargetRegistry::lookupTarget(TripleName, ErrorStr);
+    const Target *TheTarget = TargetRegistry::lookupTarget(TT, ErrorStr);
     if (!TheTarget)
-      return make_error<StringError>("Error accessing target '" + TripleName +
+      return make_error<StringError>("Error accessing target '" + TT.str() +
                                          "': " + ErrorStr,
                                      inconvertibleErrorCode());
 
     std::unique_ptr<MCSubtargetInfo> STI(
-        TheTarget->createMCSubtargetInfo(TripleName, CPU, TF.getString()));
+        TheTarget->createMCSubtargetInfo(TT, CPU, TF.getString()));
     if (!STI)
       return make_error<StringError>("Unable to create subtarget for " +
-                                         TripleName,
+                                         TT.str(),
                                      inconvertibleErrorCode());
 
-    std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+    std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TT));
     if (!MRI)
       return make_error<StringError>("Unable to create target register info "
                                      "for " +
-                                         TripleName,
+                                         TT.str(),
                                      inconvertibleErrorCode());
 
     MCTargetOptions MCOptions;
     std::unique_ptr<MCAsmInfo> MAI(
-        TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+        TheTarget->createMCAsmInfo(*MRI, TT, MCOptions));
     if (!MAI)
       return make_error<StringError>("Unable to create target asm info " +
-                                         TripleName,
+                                         TT.str(),
                                      inconvertibleErrorCode());
 
-    auto Ctx = std::make_unique<MCContext>(Triple(TripleName), MAI.get(),
+    auto Ctx = std::make_unique<MCContext>(Triple(TT.str()), MAI.get(),
                                            MRI.get(), STI.get());
 
     std::unique_ptr<MCDisassembler> Disassembler(
         TheTarget->createMCDisassembler(*STI, *Ctx));
     if (!Disassembler)
       return make_error<StringError>("Unable to create disassembler for " +
-                                         TripleName,
+                                         TT.str(),
                                      inconvertibleErrorCode());
 
     std::unique_ptr<MCInstrInfo> MII(TheTarget->createMCInstrInfo());
     if (!MII)
       return make_error<StringError>("Unable to create instruction info for" +
-                                         TripleName,
+                                         TT.str(),
                                      inconvertibleErrorCode());
 
-    std::unique_ptr<MCInstPrinter> InstPrinter(TheTarget->createMCInstPrinter(
-        Triple(TripleName), 0, *MAI, *MII, *MRI));
+    std::unique_ptr<MCInstPrinter> InstPrinter(
+        TheTarget->createMCInstPrinter(TT, 0, *MAI, *MII, *MRI));
     if (!InstPrinter)
       return make_error<StringError>(
-          "Unable to create instruction printer for" + TripleName,
+          "Unable to create instruction printer for" + TT.str(),
           inconvertibleErrorCode());
 
     return TargetInfo({TheTarget, std::move(STI), std::move(MRI),
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index ce35a5bad761..9245db442611 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -1218,6 +1218,14 @@ Pattern::MatchResult Pattern::match(StringRef Buffer,
     StringRef MatchedValue = MatchInfo[CaptureParenGroup];
     ExpressionFormat Format = DefinedNumericVariable->getImplicitFormat();
     APInt Value = Format.valueFromStringRepr(MatchedValue, SM);
+    // Numeric variables are already inserted into GlobalNumericVariableTable
+    // during parsing, but clearLocalVars might remove them, so we must
+    // reinsert them. Numeric-variable resolution does not access
+    // GlobalNumericVariableTable; it directly uses a pointer to the variable.
+    // However, other functions (such as clearLocalVars) may require active
+    // variables to be in the table.
+    Context->GlobalNumericVariableTable.try_emplace(NumericVariableDef.getKey(),
+                                                    DefinedNumericVariable);
     DefinedNumericVariable->setValue(Value, MatchedValue);
   }
 
diff --git a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
index 45391460354d..401402fb5a7b 100644
--- a/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLBinding.cpp
@@ -131,9 +131,9 @@ BindingInfo BindingInfoBuilder::calculateBindingInfo(
   return Info;
 }
 
-const BindingInfoBuilder::Binding &BindingInfoBuilder::findOverlapping(
-    const BindingInfoBuilder::Binding &ReportedBinding) const {
-  for (const BindingInfoBuilder::Binding &Other : Bindings)
+const Binding &
+BindingInfoBuilder::findOverlapping(const Binding &ReportedBinding) const {
+  for (const Binding &Other : Bindings)
     if (ReportedBinding.LowerBound <= Other.UpperBound &&
         Other.LowerBound <= ReportedBinding.UpperBound)
       return Other;
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index dece8f197aaf..31605e390034 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -52,6 +52,17 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node,
   return NodeText->getString();
 }
 
+static Expected<dxbc::ShaderVisibility>
+extractShaderVisibility(MDNode *Node, unsigned int OpId) {
+  if (std::optional<uint32_t> Val = extractMdIntValue(Node, OpId)) {
+    if (!dxbc::isValidShaderVisibility(*Val))
+      return make_error<RootSignatureValidationError<uint32_t>>(
+          "ShaderVisibility", *Val);
+    return dxbc::ShaderVisibility(*Val);
+  }
+  return make_error<InvalidRSMetadataValue>("ShaderVisibility");
+}
+
 namespace {
 
 // We use the OverloadVisit with std::visit to ensure the compiler catches if a
@@ -221,17 +232,12 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD,
   if (RootConstantNode->getNumOperands() != 5)
     return make_error<InvalidRSMetadataFormat>("RootConstants Element");
 
-  dxbc::RTS0::v1::RootParameterHeader Header;
-  // The parameter offset doesn't matter here - we recalculate it during
-  // serialization  Header.ParameterOffset = 0;
-  Header.ParameterType = to_underlying(dxbc::RootParameterType::Constants32Bit);
-
-  if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1))
-    Header.ShaderVisibility = *Val;
-  else
-    return make_error<InvalidRSMetadataValue>("ShaderVisibility");
+  Expected<dxbc::ShaderVisibility> Visibility =
+      extractShaderVisibility(RootConstantNode, 1);
+  if (auto E = Visibility.takeError())
+    return Error(std::move(E));
 
-  dxbc::RTS0::v1::RootConstants Constants;
+  mcdxbc::RootConstants Constants;
   if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
     Constants.ShaderRegister = *Val;
   else
@@ -247,7 +253,8 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD,
   else
     return make_error<InvalidRSMetadataValue>("Num32BitValues");
 
-  RSD.ParametersContainer.addParameter(Header, Constants);
+  RSD.ParametersContainer.addParameter(dxbc::RootParameterType::Constants32Bit,
+                                       *Visibility, Constants);
 
   return Error::success();
 }
@@ -263,28 +270,28 @@ Error MetadataParser::parseRootDescriptors(
   if (RootDescriptorNode->getNumOperands() != 5)
     return make_error<InvalidRSMetadataFormat>("Root Descriptor Element");
 
-  dxbc::RTS0::v1::RootParameterHeader Header;
+  dxbc::RootParameterType Type;
   switch (ElementKind) {
   case RootSignatureElementKind::SRV:
-    Header.ParameterType = to_underlying(dxbc::RootParameterType::SRV);
+    Type = dxbc::RootParameterType::SRV;
     break;
   case RootSignatureElementKind::UAV:
-    Header.ParameterType = to_underlying(dxbc::RootParameterType::UAV);
+    Type = dxbc::RootParameterType::UAV;
     break;
   case RootSignatureElementKind::CBV:
-    Header.ParameterType = to_underlying(dxbc::RootParameterType::CBV);
+    Type = dxbc::RootParameterType::CBV;
     break;
   default:
     llvm_unreachable("invalid Root Descriptor kind");
     break;
   }
 
-  if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1))
-    Header.ShaderVisibility = *Val;
-  else
-    return make_error<InvalidRSMetadataValue>("ShaderVisibility");
+  Expected<dxbc::ShaderVisibility> Visibility =
+      extractShaderVisibility(RootDescriptorNode, 1);
+  if (auto E = Visibility.takeError())
+    return Error(std::move(E));
 
-  dxbc::RTS0::v2::RootDescriptor Descriptor;
+  mcdxbc::RootDescriptor Descriptor;
   if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
     Descriptor.ShaderRegister = *Val;
   else
@@ -296,7 +303,7 @@ Error MetadataParser::parseRootDescriptors(
     return make_error<InvalidRSMetadataValue>("RegisterSpace");
 
   if (RSD.Version == 1) {
-    RSD.ParametersContainer.addParameter(Header, Descriptor);
+    RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor);
     return Error::success();
   }
   assert(RSD.Version > 1);
@@ -306,7 +313,7 @@ Error MetadataParser::parseRootDescriptors(
   else
     return make_error<InvalidRSMetadataValue>("Root Descriptor Flags");
 
-  RSD.ParametersContainer.addParameter(Header, Descriptor);
+  RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor);
   return Error::success();
 }
 
@@ -315,7 +322,7 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table,
   if (RangeDescriptorNode->getNumOperands() != 6)
     return make_error<InvalidRSMetadataFormat>("Descriptor Range");
 
-  dxbc::RTS0::v2::DescriptorRange Range;
+  mcdxbc::DescriptorRange Range;
 
   std::optional<StringRef> ElementText =
       extractMdStringValue(RangeDescriptorNode, 0);
@@ -323,15 +330,15 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table,
   if (!ElementText.has_value())
     return make_error<InvalidRSMetadataFormat>("Descriptor Range");
 
-  Range.RangeType =
-      StringSwitch<uint32_t>(*ElementText)
-          .Case("CBV", to_underlying(dxbc::DescriptorRangeType::CBV))
-          .Case("SRV", to_underlying(dxbc::DescriptorRangeType::SRV))
-          .Case("UAV", to_underlying(dxbc::DescriptorRangeType::UAV))
-          .Case("Sampler", to_underlying(dxbc::DescriptorRangeType::Sampler))
-          .Default(~0U);
-
-  if (Range.RangeType == ~0U)
+  if (*ElementText == "CBV")
+    Range.RangeType = dxil::ResourceClass::CBuffer;
+  else if (*ElementText == "SRV")
+    Range.RangeType = dxil::ResourceClass::SRV;
+  else if (*ElementText == "UAV")
+    Range.RangeType = dxil::ResourceClass::UAV;
+  else if (*ElementText == "Sampler")
+    Range.RangeType = dxil::ResourceClass::Sampler;
+  else
     return make_error<GenericRSMetadataError>("Invalid Descriptor Range type.",
                                               RangeDescriptorNode);
 
@@ -372,15 +379,12 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
   if (NumOperands < 2)
     return make_error<InvalidRSMetadataFormat>("Descriptor Table");
 
-  dxbc::RTS0::v1::RootParameterHeader Header;
-  if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
-    Header.ShaderVisibility = *Val;
-  else
-    return make_error<InvalidRSMetadataValue>("ShaderVisibility");
+  Expected<dxbc::ShaderVisibility> Visibility =
+      extractShaderVisibility(DescriptorTableNode, 1);
+  if (auto E = Visibility.takeError())
+    return Error(std::move(E));
 
   mcdxbc::DescriptorTable Table;
-  Header.ParameterType =
-      to_underlying(dxbc::RootParameterType::DescriptorTable);
 
   for (unsigned int I = 2; I < NumOperands; I++) {
     MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
@@ -392,7 +396,8 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
       return Err;
   }
 
-  RSD.ParametersContainer.addParameter(Header, Table);
+  RSD.ParametersContainer.addParameter(dxbc::RootParameterType::DescriptorTable,
+                                       *Visibility, Table);
   return Error::success();
 }
 
@@ -528,21 +533,15 @@ Error MetadataParser::validateRootSignature(
   }
 
   for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
-    if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility))
-      DeferredErrs =
-          joinErrors(std::move(DeferredErrs),
-                     make_error<RootSignatureValidationError<uint32_t>>(
-                         "ShaderVisibility", Info.Header.ShaderVisibility));
-
-    assert(dxbc::isValidParameterType(Info.Header.ParameterType) &&
-           "Invalid value for ParameterType");
 
-    switch (Info.Header.ParameterType) {
+    switch (Info.Type) {
+    case dxbc::RootParameterType::Constants32Bit:
+      break;
 
-    case to_underlying(dxbc::RootParameterType::CBV):
-    case to_underlying(dxbc::RootParameterType::UAV):
-    case to_underlying(dxbc::RootParameterType::SRV): {
-      const dxbc::RTS0::v2::RootDescriptor &Descriptor =
+    case dxbc::RootParameterType::CBV:
+    case dxbc::RootParameterType::UAV:
+    case dxbc::RootParameterType::SRV: {
+      const mcdxbc::RootDescriptor &Descriptor =
           RSD.ParametersContainer.getRootDescriptor(Info.Location);
       if (!hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
         DeferredErrs =
@@ -566,16 +565,10 @@ Error MetadataParser::validateRootSignature(
       }
       break;
     }
-    case to_underlying(dxbc::RootParameterType::DescriptorTable): {
+    case dxbc::RootParameterType::DescriptorTable: {
       const mcdxbc::DescriptorTable &Table =
           RSD.ParametersContainer.getDescriptorTable(Info.Location);
-      for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
-        if (!hlsl::rootsig::verifyRangeType(Range.RangeType))
-          DeferredErrs =
-              joinErrors(std::move(DeferredErrs),
-                         make_error<RootSignatureValidationError<uint32_t>>(
-                             "RangeType", Range.RangeType));
-
+      for (const mcdxbc::DescriptorRange &Range : Table) {
         if (!hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
           DeferredErrs =
               joinErrors(std::move(DeferredErrs),
@@ -589,7 +582,8 @@ Error MetadataParser::validateRootSignature(
                              "NumDescriptors", Range.NumDescriptors));
 
         if (!hlsl::rootsig::verifyDescriptorRangeFlag(
-                RSD.Version, Range.RangeType, Range.Flags))
+                RSD.Version, Range.RangeType,
+                dxbc::DescriptorRangeFlags(Range.Flags)))
           DeferredErrs =
               joinErrors(std::move(DeferredErrs),
                          make_error<RootSignatureValidationError<uint32_t>>(
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 72308a3de5fd..d682dda0bab2 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -51,25 +51,11 @@ bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
   return (Flags | DataFlags) == DataFlags;
 }
 
-bool verifyRangeType(uint32_t Type) {
-  switch (Type) {
-  case llvm::to_underlying(dxbc::DescriptorRangeType::CBV):
-  case llvm::to_underlying(dxbc::DescriptorRangeType::SRV):
-  case llvm::to_underlying(dxbc::DescriptorRangeType::UAV):
-  case llvm::to_underlying(dxbc::DescriptorRangeType::Sampler):
-    return true;
-  };
-
-  return false;
-}
-
-bool verifyDescriptorRangeFlag(uint32_t Version, uint32_t Type,
-                               uint32_t FlagsVal) {
+bool verifyDescriptorRangeFlag(uint32_t Version, dxil::ResourceClass Type,
+                               dxbc::DescriptorRangeFlags Flags) {
   using FlagT = dxbc::DescriptorRangeFlags;
-  FlagT Flags = FlagT(FlagsVal);
 
-  const bool IsSampler =
-      (Type == llvm::to_underlying(dxbc::DescriptorRangeType::Sampler));
+  const bool IsSampler = (Type == dxil::ResourceClass::Sampler);
 
   if (Version == 1) {
     // Since the metadata is unversioned, we expect to explicitly see the values
@@ -180,6 +166,22 @@ bool verifyBorderColor(uint32_t BorderColor) {
 
 bool verifyLOD(float LOD) { return !std::isnan(LOD); }
 
+// A bound offset is any value other than the NumDescriptorsUnbounded sentinel.
+bool verifyBoundOffset(uint32_t Offset) {
+  return Offset != NumDescriptorsUnbounded;
+}
+
+bool verifyNoOverflowedOffset(uint64_t Offset) {
+  constexpr uint64_t MaxOffset = std::numeric_limits<uint32_t>::max();
+  return Offset <= MaxOffset;
+}
+
+uint64_t computeRangeBound(uint32_t Offset, uint32_t Size) {
+  assert(0 < Size && "Must be a non-empty range");
+  const uint64_t Last = uint64_t(Offset) + uint64_t(Size) - 1;
+  return Size == NumDescriptorsUnbounded ? NumDescriptorsUnbounded : Last;
+}
+
 } // namespace rootsig
 } // namespace hlsl
 } // namespace llvm
diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp
index 9e625b809de9..f12941492547 100644
--- a/llvm/lib/Frontend/OpenMP/OMP.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMP.cpp
@@ -9,6 +9,8 @@
 #include "llvm/Frontend/OpenMP/OMP.h"
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Demangle/Demangle.h"
@@ -75,6 +77,26 @@ getFirstCompositeRange(iterator_range<ArrayRef<Directive>::iterator> Leafs) {
   return llvm::make_range(Begin, End);
 }
 
+static void
+collectPrivatizingConstructs(llvm::SmallSet<Directive, 16> &Constructs,
+                             unsigned Version) {
+  // Gather every clause that isPrivatizingClause accepts.
+  llvm::SmallSet<Clause, 16> PrivClauses;
+  for (Clause Cl :
+       llvm::enum_seq_inclusive<Clause>(Clause::First_, Clause::Last_))
+    if (isPrivatizingClause(Cl))
+      PrivClauses.insert(Cl);
+  // Record each directive on which at least one of those clauses is allowed.
+  for (Directive Dir : llvm::enum_seq_inclusive<Directive>(Directive::First_,
+                                                           Directive::Last_)) {
+    auto IsAllowed = [&](Clause Cl) {
+      return isAllowedClauseForDirective(Dir, Cl, Version);
+    };
+    if (llvm::any_of(PrivClauses, IsAllowed))
+      Constructs.insert(Dir);
+  }
+}
+
 namespace llvm::omp {
 ArrayRef<Directive> getLeafConstructs(Directive D) {
   auto Idx = static_cast<std::size_t>(D);
@@ -194,6 +216,18 @@ ArrayRef<unsigned> getOpenMPVersions() {
   return Versions;
 }
 
+bool isPrivatizingConstruct(Directive D, unsigned Version) {
+  // The result depends on Version, so the set is computed per call; a single
+  // function-local static would only ever reflect the first Version seen.
+  // As of OpenMP 6.0, privatizing constructs (with the test being if they
+  // allow a privatizing clause) are: dispatch, distribute, do, for, loop,
+  // parallel, scope, sections, simd, single, target, target_data, task,
+  // taskgroup, taskloop, and teams.
+  llvm::SmallSet<Directive, 16> Privatizing;
+  collectPrivatizingConstructs(Privatizing, Version);
+  return llvm::is_contained(Privatizing, D);
+}
+
 std::string prettifyFunctionName(StringRef FunctionName) {
   // Internalized functions have the right name, but simply a suffix.
   if (FunctionName.ends_with(".internalized"))
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 50ab206e2db8..3d5e487c8990 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -307,7 +307,19 @@ void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
 
   // Move instructions to new block.
   BasicBlock *Old = IP.getBlock();
-  New->splice(New->begin(), Old, IP.getPoint(), Old->end());
+  // If the `Old` block is empty then there are no instructions to move. But in
+  // the new debug scheme, it could have trailing debug records which will be
+  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that for 2
+  // reasons:
+  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
+  // 2. Even if `New` is not empty, the rationale to move those records to `New`
+  // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
+  // assumes that `Old` is optimized out and is going away. This is not the case
+  // here. The `Old` block is still being used e.g. a branch instruction is
+  // added to it later in this function.
+  // So we call `BasicBlock::splice` only when `Old` is not empty.
+  if (!Old->empty())
+    New->splice(New->begin(), Old, IP.getPoint(), Old->end());
 
   if (CreateBranch) {
     auto *NewBr = BranchInst::Create(New, Old);
@@ -903,6 +915,13 @@ Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                              ConstantInt::get(Int32, uint32_t(LocFlags)),
                              ConstantInt::get(Int32, Reserve2Flags),
                              ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
+
+    size_t SrcLocStrArgIdx = 4;
+    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
+            ->getPointerAddressSpace() !=
+        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
+      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
+          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
     Constant *Initializer =
         ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
 
@@ -943,8 +962,9 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
           GV.getInitializer() == Initializer)
         return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
 
-    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
-                                           /* AddressSpace */ 0, &M);
+    SrcLocStr = Builder.CreateGlobalString(
+        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
+        &M);
   }
   return SrcLocStr;
 }
@@ -5581,13 +5601,13 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
   // Compute the trip counts of the floor loops.
   Builder.SetCurrentDebugLocation(DL);
   Builder.restoreIP(OutermostLoop->getPreheaderIP());
-  SmallVector<Value *, 4> FloorCount, FloorRems;
+  SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
   for (int i = 0; i < NumLoops; ++i) {
     Value *TileSize = TileSizes[i];
     Value *OrigTripCount = OrigTripCounts[i];
     Type *IVType = OrigTripCount->getType();
 
-    Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
+    Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
     Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
 
     // 0 if tripcount divides the tilesize, 1 otherwise.
@@ -5601,11 +5621,12 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
         Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
 
     FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
-    FloorTripCount =
-        Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
+    Value *FloorTripCount =
+        Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
                           "omp_floor" + Twine(i) + ".tripcount", true);
 
     // Remember some values for later use.
+    FloorCompleteCount.push_back(FloorCompleteTripCount);
     FloorCount.push_back(FloorTripCount);
     FloorRems.push_back(FloorTripRem);
   }
@@ -5660,7 +5681,7 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
     Value *TileSize = TileSizes[i];
 
     Value *FloorIsEpilogue =
-        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
+        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
     Value *TileTripCount =
         Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
 
@@ -7369,9 +7390,8 @@ static void FixupDebugInfoForOutlinedFunction(
   // The location and scope of variable intrinsics and records still point to
   // the parent function of the target region. Update them.
   for (Instruction &I : instructions(Func)) {
-    if (auto *DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&I))
-      UpdateDebugRecord(DDI);
-
+    assert(!isa<llvm::DbgVariableIntrinsic>(&I) &&
+           "Unexpected debug intrinsic");
     for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
       UpdateDebugRecord(&DVR);
   }
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index dc6d599fa958..094678f32af2 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -88,6 +88,8 @@
 
 using namespace llvm;
 
+// See https://llvm.org/docs/DebuggingLLVM.html for why these flags are useful.
+
 static cl::opt<bool>
     PrintInstAddrs("print-inst-addrs", cl::Hidden,
                    cl::desc("Print addresses of instructions when dumping"));
diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp
index 6adbbc4a63b0..f8bbcb32231c 100644
--- a/llvm/lib/IR/Assumptions.cpp
+++ b/llvm/lib/IR/Assumptions.cpp
@@ -101,12 +101,16 @@ bool llvm::addAssumptions(CallBase &CB,
   return ::addAssumptionsImpl(CB, Assumptions);
 }
 
-StringSet<> llvm::KnownAssumptionStrings({
-    "omp_no_openmp",            // OpenMP 5.1
-    "omp_no_openmp_routines",   // OpenMP 5.1
-    "omp_no_parallelism",       // OpenMP 5.1
-    "omp_no_openmp_constructs", // OpenMP 6.0
-    "ompx_spmd_amenable",       // OpenMPOpt extension
-    "ompx_no_call_asm",         // OpenMPOpt extension
-    "ompx_aligned_barrier",     // OpenMPOpt extension
-});
+StringSet<> &llvm::getKnownAssumptionStrings() {
+  // A function-local static is constructed the first time control reaches it.
+  static StringSet<> KnownStrings({
+      "omp_no_openmp",            // OpenMP 5.1
+      "omp_no_openmp_routines",   // OpenMP 5.1
+      "omp_no_parallelism",       // OpenMP 5.1
+      "omp_no_openmp_constructs", // OpenMP 6.0
+      "ompx_spmd_amenable",       // OpenMPOpt extension
+      "ompx_no_call_asm",         // OpenMPOpt extension
+      "ompx_aligned_barrier",     // OpenMPOpt extension
+  });
+  return KnownStrings;
+}
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index d1fbcb9e893a..4ac2ebd55dca 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -954,6 +954,19 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C,
   return get(C, B);
 }
 
+AttributeSet AttributeSet::addAttributes(LLVMContext &C,
+                                         const AttrBuilder &B) const {
+  // With no attributes in this set, the result is built from B alone.
+  if (!hasAttributes())
+    return get(C, B);
+  // With nothing in B to add, this set is returned unchanged.
+  if (!B.hasAttributes())
+    return *this;
+  AttrBuilder Combined(C, *this);
+  Combined.merge(B);
+  return get(C, Combined);
+}
+
 AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
                                              Attribute::AttrKind Kind) const {
   if (!hasAttribute(Kind)) return *this;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e200f3626e69..8d8120ac9ed9 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/NVPTXAddrSpace.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
 #include <cstring>
@@ -106,6 +107,24 @@ static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID,
   return true;
 }
 
+// Upgrade the declaration of multiply and add bytes intrinsics whose input
+// arguments' types have changed from vectors of i32 to vectors of i8.
+static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID,
+                                       Function *&NewFn) {
+  auto IsI8Vector = [](Type *Ty) {
+    auto *VecTy = dyn_cast<VectorType>(Ty);
+    return VecTy && VecTy->getElementType()->isIntegerTy(8);
+  };
+  Type *Param1Ty = F->getFunctionType()->getParamType(1);
+  Type *Param2Ty = F->getFunctionType()->getParamType(2);
+  // Both parameters are already vectors of i8: no upgrade is performed.
+  if (IsI8Vector(Param1Ty) && IsI8Vector(Param2Ty))
+    return false;
+  rename(F);
+  NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID,
                                     Function *&NewFn) {
   if (F->getReturnType()->getScalarType()->isBFloatTy())
@@ -545,19 +564,34 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name,
   if (ID != Intrinsic::not_intrinsic)
     return upgradeX86IntrinsicsWith8BitMask(F, ID, NewFn);
 
-  if (Name.consume_front("avx512.mask.cmp.")) {
-    // Added in 7.0
-    ID = StringSwitch<Intrinsic::ID>(Name)
-             .Case("pd.128", Intrinsic::x86_avx512_mask_cmp_pd_128)
-             .Case("pd.256", Intrinsic::x86_avx512_mask_cmp_pd_256)
-             .Case("pd.512", Intrinsic::x86_avx512_mask_cmp_pd_512)
-             .Case("ps.128", Intrinsic::x86_avx512_mask_cmp_ps_128)
-             .Case("ps.256", Intrinsic::x86_avx512_mask_cmp_ps_256)
-             .Case("ps.512", Intrinsic::x86_avx512_mask_cmp_ps_512)
-             .Default(Intrinsic::not_intrinsic);
-    if (ID != Intrinsic::not_intrinsic)
-      return upgradeX86MaskedFPCompare(F, ID, NewFn);
-    return false; // No other 'x86.avx523.mask.cmp.*'.
+  if (Name.consume_front("avx512.")) {
+    if (Name.consume_front("mask.cmp.")) {
+      // Added in 7.0
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("pd.128", Intrinsic::x86_avx512_mask_cmp_pd_128)
+               .Case("pd.256", Intrinsic::x86_avx512_mask_cmp_pd_256)
+               .Case("pd.512", Intrinsic::x86_avx512_mask_cmp_pd_512)
+               .Case("ps.128", Intrinsic::x86_avx512_mask_cmp_ps_128)
+               .Case("ps.256", Intrinsic::x86_avx512_mask_cmp_ps_256)
+               .Case("ps.512", Intrinsic::x86_avx512_mask_cmp_ps_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MaskedFPCompare(F, ID, NewFn);
+    } else if (Name.starts_with("vpdpbusd.") ||
+               Name.starts_with("vpdpbusds.")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("vpdpbusd.128", Intrinsic::x86_avx512_vpdpbusd_128)
+               .Case("vpdpbusd.256", Intrinsic::x86_avx512_vpdpbusd_256)
+               .Case("vpdpbusd.512", Intrinsic::x86_avx512_vpdpbusd_512)
+               .Case("vpdpbusds.128", Intrinsic::x86_avx512_vpdpbusds_128)
+               .Case("vpdpbusds.256", Intrinsic::x86_avx512_vpdpbusds_256)
+               .Case("vpdpbusds.512", Intrinsic::x86_avx512_vpdpbusds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    }
+    return false; // No other 'x86.avx512.*'.
   }
 
   if (Name.consume_front("avx512bf16.")) {
@@ -4148,6 +4182,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
+
+    // Input arguments types were incorrectly set to vectors of i32 before but
+    // they should be vectors of i8. Insert bit cast when encountering the old
+    // types
+    if (Args[1]->getType()->isVectorTy() &&
+        cast<VectorType>(Args[1]->getType())
+            ->getElementType()
+            ->isIntegerTy(32) &&
+        Args[2]->getType()->isVectorTy() &&
+        cast<VectorType>(Args[2]->getType())
+            ->getElementType()
+            ->isIntegerTy(32)) {
+      Type *NewArgType = nullptr;
+      if (VecWidth == 128)
+        NewArgType = VectorType::get(Builder.getInt8Ty(), 16, false);
+      else if (VecWidth == 256)
+        NewArgType = VectorType::get(Builder.getInt8Ty(), 32, false);
+      else if (VecWidth == 512)
+        NewArgType = VectorType::get(Builder.getInt8Ty(), 64, false);
+      else
+        llvm_unreachable("Unexpected vector bit width");
+
+      Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+      Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+    }
+
     Rep = Builder.CreateIntrinsic(IID, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
@@ -5155,6 +5215,23 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
   }
+
+  case Intrinsic::x86_avx512_vpdpbusd_128:
+  case Intrinsic::x86_avx512_vpdpbusd_256:
+  case Intrinsic::x86_avx512_vpdpbusd_512:
+  case Intrinsic::x86_avx512_vpdpbusds_128:
+  case Intrinsic::x86_avx512_vpdpbusds_256:
+  case Intrinsic::x86_avx512_vpdpbusds_512: {
+    unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 8;
+    Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2)};
+    Type *NewArgType = VectorType::get(Builder.getInt8Ty(), NumElts, false);
+    Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+    Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
+  }
   }
   assert(NewCall && "Should have either set this variable or returned through "
                     "the default case");
@@ -5256,6 +5333,7 @@ bool llvm::UpgradeDebugInfo(Module &M) {
   if (DisableAutoUpgradeDebugInfo)
     return false;
 
+  llvm::TimeTraceScope timeScope("Upgrade debug info");
   // We need to get metadata before the module is verified (i.e., getModuleFlag
   // makes assumptions that we haven't verified yet). Carefully extract the flag
   // from the metadata.
@@ -5381,6 +5459,16 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K,
     upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V);
     return true;
   }
+  if (K == "grid_constant") {
+    const auto Attr = Attribute::get(GV->getContext(), "nvvm.grid_constant");
+    for (const auto &Op : cast<MDNode>(V)->operands()) {
+      // For some reason, the index is 1-based in the metadata. Good thing we're
+      // able to auto-upgrade it!
+      const auto Index = mdconst::extract<ConstantInt>(Op)->getZExtValue() - 1;
+      cast<Function>(GV)->addParamAttr(Index, Attr);
+    }
+    return true;
+  }
 
   return false;
 }
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index dbd6d81ad2e2..ed629d4e5ea2 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -187,7 +187,6 @@ const char *DataLayout::getManglingComponent(const Triple &T) {
 // Default primitive type specifications.
 // NOTE: These arrays must be sorted by type bit width.
 constexpr DataLayout::PrimitiveSpec DefaultIntSpecs[] = {
-    {1, Align::Constant<1>(), Align::Constant<1>()},  // i1:8:8
     {8, Align::Constant<1>(), Align::Constant<1>()},  // i8:8:8
     {16, Align::Constant<2>(), Align::Constant<2>()}, // i16:16:16
     {32, Align::Constant<4>(), Align::Constant<4>()}, // i32:32:32
@@ -694,7 +693,12 @@ void DataLayout::setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth,
 
 Align DataLayout::getIntegerAlignment(uint32_t BitWidth,
                                       bool abi_or_pref) const {
-  auto I = lower_bound(IntSpecs, BitWidth, LessPrimitiveBitWidth());
+  auto I = IntSpecs.begin();
+  for (; I != IntSpecs.end(); ++I) {
+    if (I->BitWidth >= BitWidth)
+      break;
+  }
+
   // If we don't have an exact match, use alignment of next larger integer
   // type. If there is none, use alignment of largest integer type by going
   // back one element.
@@ -839,6 +843,44 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
   }
 }
 
+TypeSize DataLayout::getTypeAllocSize(Type *Ty) const {
+  switch (Ty->getTypeID()) {
+  case Type::ArrayTyID: {
+    // The alignment of the array is the alignment of the element, so the
+    // element count times the element alloc size needs no further adjustment.
+    auto *ArrTy = cast<ArrayType>(Ty);
+    return ArrTy->getNumElements() * getTypeAllocSize(ArrTy->getElementType());
+  }
+  case Type::StructTyID: {
+    auto *STy = cast<StructType>(Ty);
+    const StructLayout *SL = getStructLayout(STy);
+    TypeSize RawSize = SL->getSizeInBytes();
+    // Packed structs return the layout size without rounding it up.
+    if (STy->isPacked())
+      return RawSize;
+    Align A = std::max(StructABIAlignment, SL->getAlignment());
+    return alignTo(RawSize, A.value());
+  }
+  case Type::IntegerTyID: {
+    // Byte size (bit width rounded up to bytes) aligned to the ABI alignment.
+    unsigned Bits = Ty->getIntegerBitWidth();
+    Align IntAlign = getIntegerAlignment(Bits, /*ABI=*/true);
+    return alignTo(TypeSize::getFixed(divideCeil(Bits, 8)), IntAlign.value());
+  }
+  case Type::PointerTyID: {
+    unsigned AddrSpace = Ty->getPointerAddressSpace();
+    TypeSize PtrSize = TypeSize::getFixed(getPointerSize(AddrSpace));
+    return alignTo(PtrSize, getPointerABIAlignment(AddrSpace).value());
+  }
+  case Type::TargetExtTyID:
+    // Target extension types take the alloc size of their layout type.
+    return getTypeAllocSize(cast<TargetExtType>(Ty)->getLayoutType());
+  default:
+    // Everything else: store size rounded up to the ABI type alignment.
+    return alignTo(getTypeStoreSize(Ty), getABITypeAlign(Ty).value());
+  }
+}
+
 Align DataLayout::getABITypeAlign(Type *Ty) const {
   return getAlignment(Ty, true);
 }
@@ -926,12 +968,13 @@ static APInt getElementIndex(TypeSize ElemSize, APInt &Offset) {
     return APInt::getZero(BitWidth);
   }
 
-  APInt Index = Offset.sdiv(ElemSize);
-  Offset -= Index * ElemSize;
+  uint64_t FixedElemSize = ElemSize.getFixedValue();
+  APInt Index = Offset.sdiv(FixedElemSize);
+  Offset -= Index * FixedElemSize;
   if (Offset.isNegative()) {
     // Prefer a positive remaining offset to allow struct indexing.
     --Index;
-    Offset += ElemSize;
+    Offset += FixedElemSize;
     assert(Offset.isNonNegative() && "Remaining offset shouldn't be negative");
   }
   return Index;
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 8e523bcf7960..166521a27664 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <algorithm>
 #include <cassert>
 #include <optional>
@@ -563,6 +564,7 @@ bool llvm::stripDebugInfo(Function &F) {
 }
 
 bool llvm::StripDebugInfo(Module &M) {
+  llvm::TimeTraceScope timeScope("Strip debug info");
   bool Changed = false;
 
   for (NamedMDNode &NMD : llvm::make_early_inc_range(M.named_metadata())) {
@@ -755,7 +757,7 @@ private:
 
       return getReplacementMDNode(N);
     };
-    // Seperate recursive doRemap and operator [] into 2 lines to avoid
+    // Separate recursive doRemap and operator [] into 2 lines to avoid
     // out-of-order evaluations since both of them can access the same memory
     // location in map Replacements.
     auto Value = doRemap(N);
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 79c5b896f8f2..01dafcab94ce 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -181,10 +181,19 @@ DebugLoc DebugLoc::getMergedLocations(ArrayRef<DebugLoc> Locs) {
   return Merged;
 }
 DebugLoc DebugLoc::getMergedLocation(DebugLoc LocA, DebugLoc LocB) {
-  if (!LocA)
-    return LocA;
-  if (!LocB)
+  if (!LocA || !LocB) {
+    // If coverage tracking is enabled, prefer returning an empty non-annotated
+    // location over an empty annotated one.
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
+    if (!LocA && LocA.getKind() == DebugLocKind::Normal)
+      return LocA;
+    if (!LocB && LocB.getKind() == DebugLocKind::Normal)
+      return LocB;
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
+    if (!LocA)
+      return LocA;
     return LocB;
+  }
   return DILocation::getMergedLocation(LocA, LocB);
 }
 
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 70d364176062..30b5e48652b2 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/VersionTuple.h"
 #include <cassert>
 #include <cstdint>
@@ -478,6 +479,7 @@ Error Module::materializeAll() {
 }
 
 Error Module::materializeMetadata() {
+  llvm::TimeTraceScope timeScope("Materialize metadata");
   if (!Materializer)
     return Error::success();
   return Materializer->materializeMetadata();
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index d24263f8b3bd..b41256f59909 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -250,7 +250,15 @@ void setExplicitlyUnknownBranchWeights(Instruction &I) {
                   MDB.createString(MDProfLabels::UnknownBranchWeightsMarker)));
 }
 
-bool isExplicitlyUnknownBranchWeightsMetadata(const MDNode &MD) {
+void setExplicitlyUnknownFunctionEntryCount(Function &F) {
+  MDBuilder MDB(F.getContext());
+  F.setMetadata(
+      LLVMContext::MD_prof,
+      MDNode::get(F.getContext(),
+                  MDB.createString(MDProfLabels::UnknownBranchWeightsMarker)));
+}
+
+bool isExplicitlyUnknownProfileMetadata(const MDNode &MD) {
   if (MD.getNumOperands() != 1)
     return false;
   return MD.getOperand(0).equalsStr(MDProfLabels::UnknownBranchWeightsMarker);
@@ -260,7 +268,7 @@ bool hasExplicitlyUnknownBranchWeights(const Instruction &I) {
   auto *MD = I.getMetadata(LLVMContext::MD_prof);
   if (!MD)
     return false;
-  return isExplicitlyUnknownBranchWeightsMetadata(*MD);
+  return isExplicitlyUnknownProfileMetadata(*MD);
 }
 
 void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 3c324f2fe0d8..4fe5714a74e3 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -40,13 +40,19 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
     // hard-float calling convention by default.
     if (!TT.isWatchABI()) {
       if (isAAPCS_ABI(TT, ABIName)) {
-        setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
-        setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
-        setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
+        setLibcallImplCallingConv(RTLIB::impl___truncsfhf2,
+                                  CallingConv::ARM_AAPCS);
+        setLibcallImplCallingConv(RTLIB::impl___truncdfhf2,
+                                  CallingConv::ARM_AAPCS);
+        setLibcallImplCallingConv(RTLIB::impl___extendhfsf2,
+                                  CallingConv::ARM_AAPCS);
       } else {
-        setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
-        setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
-        setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
+        setLibcallImplCallingConv(RTLIB::impl___truncsfhf2,
+                                  CallingConv::ARM_APCS);
+        setLibcallImplCallingConv(RTLIB::impl___truncdfhf2,
+                                  CallingConv::ARM_APCS);
+        setLibcallImplCallingConv(RTLIB::impl___extendhfsf2,
+                                  CallingConv::ARM_APCS);
       }
     }
 
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 9c3466234035..9db48e8f6a96 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -1036,7 +1036,8 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
   // DirectX resources
   if (Name.starts_with("dx."))
     return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal,
-                          TargetExtType::CanBeLocal);
+                          TargetExtType::CanBeLocal,
+                          TargetExtType::IsTokenLike);
 
   // Opaque types in the AMDGPU name space.
   if (Name == "amdgcn.named.barrier") {
@@ -1054,6 +1055,14 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
   return TargetTypeInfo(Type::getVoidTy(C));
 }
 
+bool Type::isTokenLikeTy() const {
+  if (isTokenTy())
+    return true;
+  if (auto *TT = dyn_cast<TargetExtType>(this))
+    return TT->hasProperty(TargetExtType::Property::IsTokenLike);
+  return false;
+}
+
 Type *TargetExtType::getLayoutType() const {
   return getTargetTypeInfo(this).LayoutType;
 }
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 5928c89029b8..4e8f359481b8 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -836,6 +836,9 @@ bool Value::canBeFreed() const {
       return false;
   }
 
+  if (isa<IntToPtrInst>(this) && getMetadata(LLVMContext::MD_nofree))
+    return false;
+
   const Function *F = nullptr;
   if (auto *I = dyn_cast<Instruction>(this))
     F = I->getFunction();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 4eb4b58d022e..81a53722f489 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -119,6 +119,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ModRef.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -399,6 +400,7 @@ public:
   bool hasBrokenDebugInfo() const { return BrokenDebugInfo; }
 
   bool verify(const Function &F) {
+    llvm::TimeTraceScope timeScope("Verifier");
     assert(F.getParent() == &M &&
            "An instance of this class only works with a specific module!");
 
@@ -526,6 +528,7 @@ private:
   void visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty);
   void visitNoaliasAddrspaceMetadata(Instruction &I, MDNode *Range, Type *Ty);
   void visitDereferenceableMetadata(Instruction &I, MDNode *MD);
+  void visitNofreeMetadata(Instruction &I, MDNode *MD);
   void visitProfMetadata(Instruction &I, MDNode *MD);
   void visitCallStackMetadata(MDNode *MD);
   void visitMemProfMetadata(Instruction &I, MDNode *MD);
@@ -1298,9 +1301,11 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
   if (N.getTag() == dwarf::DW_TAG_set_type) {
     if (auto *T = N.getRawBaseType()) {
       auto *Enum = dyn_cast_or_null<DICompositeType>(T);
+      auto *Subrange = dyn_cast_or_null<DISubrangeType>(T);
       auto *Basic = dyn_cast_or_null<DIBasicType>(T);
       CheckDI(
           (Enum && Enum->getTag() == dwarf::DW_TAG_enumeration_type) ||
+              (Subrange && Subrange->getTag() == dwarf::DW_TAG_subrange_type) ||
               (Basic && (Basic->getEncoding() == dwarf::DW_ATE_unsigned ||
                          Basic->getEncoding() == dwarf::DW_ATE_signed ||
                          Basic->getEncoding() == dwarf::DW_ATE_unsigned_char ||
@@ -2443,16 +2448,6 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
       CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V);
   }
 
-  // Check EVEX512 feature.
-  if (TT.isX86() && MaxParameterWidth >= 512) {
-    Attribute TargetFeaturesAttr = Attrs.getFnAttr("target-features");
-    if (TargetFeaturesAttr.isValid()) {
-      StringRef TF = TargetFeaturesAttr.getValueAsString();
-      Check(!TF.contains("+avx512f") || !TF.contains("-evex512"),
-            "512-bit vector arguments require 'evex512' for AVX512", V);
-    }
-  }
-
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-prefix", V);
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-entry", V);
   if (Attrs.hasFnAttr("patchable-function-entry-section"))
@@ -2526,12 +2521,11 @@ void Verifier::verifyFunctionMetadata(
   for (const auto &Pair : MDs) {
     if (Pair.first == LLVMContext::MD_prof) {
       MDNode *MD = Pair.second;
-      if (isExplicitlyUnknownBranchWeightsMetadata(*MD)) {
-        CheckFailed("'unknown' !prof metadata should appear only on "
-                    "instructions supporting the 'branch_weights' metadata",
-                    MD);
+      // We may have functions that are synthesized by the compiler, e.g. in
+      // WPD, for which we can't currently determine the entry count.
+      if (isExplicitlyUnknownProfileMetadata(*MD))
         continue;
-      }
+
       Check(MD->getNumOperands() >= 2,
             "!prof annotations should have no less than 2 operands", MD);
 
@@ -2830,6 +2824,7 @@ static Instruction *getSuccPad(Instruction *Terminator) {
 }
 
 void Verifier::verifySiblingFuncletUnwinds() {
+  llvm::TimeTraceScope timeScope("Verifier verify sibling funclet unwinds");
   SmallPtrSet<Instruction *, 8> Visited;
   SmallPtrSet<Instruction *, 8> Active;
   for (const auto &Pair : SiblingFuncletInfo) {
@@ -3006,7 +3001,7 @@ void Verifier::visitFunction(const Function &F) {
     if (!IsIntrinsic) {
       Check(!Arg.getType()->isMetadataTy(),
             "Function takes metadata but isn't an intrinsic", &Arg, &F);
-      Check(!Arg.getType()->isTokenTy(),
+      Check(!Arg.getType()->isTokenLikeTy(),
             "Function takes token but isn't an intrinsic", &Arg, &F);
       Check(!Arg.getType()->isX86_AMXTy(),
             "Function takes x86_amx but isn't an intrinsic", &Arg, &F);
@@ -3020,7 +3015,7 @@ void Verifier::visitFunction(const Function &F) {
   }
 
   if (!IsIntrinsic) {
-    Check(!F.getReturnType()->isTokenTy(),
+    Check(!F.getReturnType()->isTokenLikeTy(),
           "Function returns a token but isn't an intrinsic", &F);
     Check(!F.getReturnType()->isX86_AMXTy(),
           "Function returns a x86_amx but isn't an intrinsic", &F);
@@ -3190,7 +3185,7 @@ void Verifier::visitFunction(const Function &F) {
 
     // Scope and SP could be the same MDNode and we don't want to skip
     // validation in that case
-    if (SP && ((Scope != SP) && !Seen.insert(SP).second))
+    if ((Scope != SP) && !Seen.insert(SP).second)
       return;
 
     CheckDI(SP->describes(&F),
@@ -3634,7 +3629,7 @@ void Verifier::visitPHINode(PHINode &PN) {
         "PHI nodes not grouped at top of basic block!", &PN, PN.getParent());
 
   // Check that a PHI doesn't yield a Token.
-  Check(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!");
+  Check(!PN.getType()->isTokenLikeTy(), "PHI nodes cannot have token type!");
 
   // Check that all of the values of the PHI node have the same type as the
   // result.
@@ -3839,14 +3834,14 @@ void Verifier::visitCallBase(CallBase &Call) {
     for (Type *ParamTy : FTy->params()) {
       Check(!ParamTy->isMetadataTy(),
             "Function has metadata parameter but isn't an intrinsic", Call);
-      Check(!ParamTy->isTokenTy(),
+      Check(!ParamTy->isTokenLikeTy(),
             "Function has token parameter but isn't an intrinsic", Call);
     }
   }
 
   // Verify that indirect calls don't return tokens.
   if (!Call.getCalledFunction()) {
-    Check(!FTy->getReturnType()->isTokenTy(),
+    Check(!FTy->getReturnType()->isTokenLikeTy(),
           "Return type cannot be token for indirect call!");
     Check(!FTy->getReturnType()->isX86_AMXTy(),
           "Return type cannot be x86_amx for indirect call!");
@@ -5021,6 +5016,13 @@ void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) {
         &I);
 }
 
+void Verifier::visitNofreeMetadata(Instruction &I, MDNode *MD) {
+  Check(I.getType()->isPointerTy(), "nofree applies only to pointer types", &I);
+  Check((isa<IntToPtrInst>(I)), "nofree applies only to inttoptr instruction",
+        &I);
+  Check(MD->getNumOperands() == 0, "nofree metadata must be empty", &I);
+}
+
 void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
   auto GetBranchingTerminatorNumOperands = [&]() {
     unsigned ExpectedNumOperands = 0;
@@ -5496,6 +5498,9 @@ void Verifier::visitInstruction(Instruction &I) {
   if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable_or_null))
     visitDereferenceableMetadata(I, MD);
 
+  if (MDNode *MD = I.getMetadata(LLVMContext::MD_nofree))
+    visitNofreeMetadata(I, MD);
+
   if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa))
     TBAAVerifyHelper.visitTBAAMetadata(I, TBAA);
 
@@ -6724,7 +6729,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "invalid vector type for format", &Call, Src1, Call.getArgOperand(5));
     break;
   }
-  case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4: {
+  case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
+  case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
+  case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
     Value *Src0 = Call.getArgOperand(1);
     Value *Src1 = Call.getArgOperand(3);
 
@@ -6772,6 +6779,28 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
     break;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    // Check we only use this intrinsic on the FLAT or GLOBAL address spaces.
+    Value *PtrArg = Call.getArgOperand(0);
+    const unsigned AS = PtrArg->getType()->getPointerAddressSpace();
+    Check(AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS,
+          "cooperative atomic intrinsics require a generic or global pointer",
+          &Call, PtrArg);
+
+    // The last argument must be an MD string.
+    auto *Op = cast<MetadataAsValue>(Call.getArgOperand(Call.arg_size() - 1));
+    MDNode *MD = cast<MDNode>(Op->getMetadata());
+    Check((MD->getNumOperands() == 1) && isa<MDString>(MD->getOperand(0)),
+          "cooperative atomic intrinsics require that the last argument is a "
+          "metadata string",
+          &Call, Op);
+    break;
+  }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
   case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
     Value *V = Call.getArgOperand(0);
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 35d24c17bbd9..ce9ecc35e192 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -631,6 +631,7 @@ LTO::~LTO() = default;
 void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
                                ArrayRef<SymbolResolution> Res,
                                unsigned Partition, bool InSummary) {
+  llvm::TimeTraceScope timeScope("LTO add module to global resolution");
   auto *ResI = Res.begin();
   auto *ResE = Res.end();
   (void)ResE;
@@ -731,6 +732,7 @@ static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
 
 Error LTO::add(std::unique_ptr<InputFile> Input,
                ArrayRef<SymbolResolution> Res) {
+  llvm::TimeTraceScope timeScope("LTO add input", Input->getName());
   assert(!CalledGetMaxTasks);
 
   if (Conf.ResolutionFile)
@@ -756,6 +758,7 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
 Expected<ArrayRef<SymbolResolution>>
 LTO::addModule(InputFile &Input, ArrayRef<SymbolResolution> InputRes,
                unsigned ModI, ArrayRef<SymbolResolution> Res) {
+  llvm::TimeTraceScope timeScope("LTO add module", Input.getName());
   Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
   if (!LTOInfo)
     return LTOInfo.takeError();
@@ -850,6 +853,7 @@ Expected<
 LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes,
                    BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                    ArrayRef<SymbolResolution> Res) {
+  llvm::TimeTraceScope timeScope("LTO add regular LTO");
   RegularLTOState::AddedModule Mod;
   Expected<std::unique_ptr<Module>> MOrErr =
       BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
@@ -1024,6 +1028,7 @@ LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes,
 
 Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
                           bool LivenessFromIndex) {
+  llvm::TimeTraceScope timeScope("LTO link regular LTO");
   std::vector<GlobalValue *> Keep;
   for (GlobalValue *GV : Mod.Keep) {
     if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) {
@@ -1063,6 +1068,7 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
 Expected<ArrayRef<SymbolResolution>>
 LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                 ArrayRef<SymbolResolution> Res) {
+  llvm::TimeTraceScope timeScope("LTO add thin LTO");
   ArrayRef<SymbolResolution> ResTmp = Res;
   for (const InputFile::Symbol &Sym : Syms) {
     assert(!ResTmp.empty());
@@ -1252,6 +1258,7 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
 
 void lto::updateMemProfAttributes(Module &Mod,
                                   const ModuleSummaryIndex &Index) {
+  llvm::TimeTraceScope timeScope("LTO update memprof attributes");
   if (Index.withSupportsHotColdNew())
     return;
 
@@ -1282,6 +1289,7 @@ void lto::updateMemProfAttributes(Module &Mod,
 }
 
 Error LTO::runRegularLTO(AddStreamFn AddStream) {
+  llvm::TimeTraceScope timeScope("Run regular LTO");
   // Setup optimization remarks.
   auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
       RegularLTO.CombinedModule->getContext(), Conf.RemarksFilename,
@@ -1294,10 +1302,12 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
 
   // Finalize linking of regular LTO modules containing summaries now that
   // we have computed liveness information.
-  for (auto &M : RegularLTO.ModsWithSummaries)
-    if (Error Err = linkRegularLTO(std::move(M),
-                                   /*LivenessFromIndex=*/true))
-      return Err;
+  {
+    llvm::TimeTraceScope timeScope("Link regular LTO");
+    for (auto &M : RegularLTO.ModsWithSummaries)
+      if (Error Err = linkRegularLTO(std::move(M), /*LivenessFromIndex=*/true))
+        return Err;
+  }
 
   // Ensure we don't have inconsistently split LTO units with type tests.
   // FIXME: this checks both LTO and ThinLTO. It happens to work as we take
@@ -1526,6 +1536,9 @@ public:
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedGlobals,
       MapVector<StringRef, BitcodeModule> &ModuleMap) {
+    auto ModuleID = BM.getModuleIdentifier();
+    llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (in-process)",
+                                   ModuleID);
     auto RunThinBackend = [&](AddStreamFn AddStream) {
       LTOLLVMContext BackendContext(Conf);
       Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
@@ -1536,9 +1549,6 @@ public:
                          ImportList, DefinedGlobals, &ModuleMap,
                          Conf.CodeGenOnly);
     };
-
-    auto ModuleID = BM.getModuleIdentifier();
-
     if (ShouldEmitIndexFiles) {
       if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str()))
         return E;
@@ -1639,6 +1649,9 @@ public:
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedGlobals,
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    auto ModuleID = BM.getModuleIdentifier();
+    llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (first round)",
+                                   ModuleID);
     auto RunThinBackend = [&](AddStreamFn CGAddStream,
                               AddStreamFn IRAddStream) {
       LTOLLVMContext BackendContext(Conf);
@@ -1650,8 +1663,6 @@ public:
                          ImportList, DefinedGlobals, &ModuleMap,
                          Conf.CodeGenOnly, IRAddStream);
     };
-
-    auto ModuleID = BM.getModuleIdentifier();
     // Like InProcessThinBackend, we produce index files as needed for
     // FirstRoundThinBackend. However, these files are not generated for
     // SecondRoundThinBackend.
@@ -1735,6 +1746,9 @@ public:
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedGlobals,
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    auto ModuleID = BM.getModuleIdentifier();
+    llvm::TimeTraceScope timeScope("Run ThinLTO backend thread (second round)",
+                                   ModuleID);
     auto RunThinBackend = [&](AddStreamFn AddStream) {
       LTOLLVMContext BackendContext(Conf);
       std::unique_ptr<Module> LoadedModule =
@@ -1744,8 +1758,6 @@ public:
                          ImportList, DefinedGlobals, &ModuleMap,
                          /*CodeGenOnly=*/true);
     };
-
-    auto ModuleID = BM.getModuleIdentifier();
     if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
         all_of(CombinedIndex.getModuleHash(ModuleID),
                [](uint32_t V) { return V == 0; }))
@@ -1915,6 +1927,7 @@ ThinBackend lto::createWriteIndexesThinBackend(
 
 Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
                       const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+  llvm::TimeTraceScope timeScope("Run ThinLTO");
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
   ThinLTO.CombinedIndex.releaseTemporaryMemory();
   timeTraceProfilerBegin("ThinLink", StringRef(""));
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 5e8cd12fe040..ce42fc526bea 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -366,6 +366,7 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
               bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
               const ModuleSummaryIndex *ImportSummary,
               const std::vector<uint8_t> &CmdArgs) {
+  llvm::TimeTraceScope timeScope("opt");
   if (EmbedBitcode == LTOBitcodeEmbedding::EmbedPostMergePreOptimized) {
     // FIXME: the motivation for capturing post-merge bitcode and command line
     // is replicating the compilation environment from bitcode, without needing
@@ -399,6 +400,7 @@ bool lto::opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
 static void codegen(const Config &Conf, TargetMachine *TM,
                     AddStreamFn AddStream, unsigned Task, Module &Mod,
                     const ModuleSummaryIndex &CombinedIndex) {
+  llvm::TimeTraceScope timeScope("codegen");
   if (Conf.PreCodeGenModuleHook && !Conf.PreCodeGenModuleHook(Task, Mod))
     return;
 
@@ -552,6 +554,7 @@ Error lto::finalizeOptimizationRemarks(
 Error lto::backend(const Config &C, AddStreamFn AddStream,
                    unsigned ParallelCodeGenParallelismLevel, Module &Mod,
                    ModuleSummaryIndex &CombinedIndex) {
+  llvm::TimeTraceScope timeScope("LTO backend");
   Expected<const Target *> TOrErr = initAndLookupTarget(C, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -577,6 +580,7 @@ Error lto::backend(const Config &C, AddStreamFn AddStream,
 
 static void dropDeadSymbols(Module &Mod, const GVSummaryMapTy &DefinedGlobals,
                             const ModuleSummaryIndex &Index) {
+  llvm::TimeTraceScope timeScope("Drop dead symbols");
   std::vector<GlobalValue*> DeadGVs;
   for (auto &GV : Mod.global_values())
     if (GlobalValueSummary *GVS = DefinedGlobals.lookup(GV.getGUID()))
@@ -603,6 +607,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
                        bool CodeGenOnly, AddStreamFn IRAddStream,
                        const std::vector<uint8_t> &CmdArgs) {
+  llvm::TimeTraceScope timeScope("Thin backend", Mod.getModuleIdentifier());
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -679,6 +684,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
   auto ModuleLoader = [&](StringRef Identifier) {
+    llvm::TimeTraceScope moduleLoaderScope("Module loader", Identifier);
     assert(Mod.getContext().isODRUniquingDebugTypes() &&
            "ODR Type uniquing should be enabled on the context");
     if (ModuleMap) {
@@ -712,10 +718,13 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
     return MOrErr;
   };
 
-  FunctionImporter Importer(CombinedIndex, ModuleLoader,
-                            ClearDSOLocalOnDeclarations);
-  if (Error Err = Importer.importFunctions(Mod, ImportList).takeError())
-    return Err;
+  {
+    llvm::TimeTraceScope importScope("Import functions");
+    FunctionImporter Importer(CombinedIndex, ModuleLoader,
+                              ClearDSOLocalOnDeclarations);
+    if (Error Err = Importer.importFunctions(Mod, ImportList).takeError())
+      return Err;
+  }
 
   // Do this after any importing so that imported code is updated.
   updateMemProfAttributes(Mod, CombinedIndex);
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 09b91d81225a..cdeab98ff6c9 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -384,7 +384,7 @@ bool LTOCodeGenerator::determineTarget() {
 
   // create target machine from info for merged modules
   std::string ErrMsg;
-  MArch = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
+  MArch = TargetRegistry::lookupTarget(Triple, ErrMsg);
   if (!MArch) {
     emitError(ErrMsg);
     return false;
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index d6c15de4c4cd..1bff6cd25156 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -8,6 +8,8 @@
 
 #include "llvm/Linker/IRMover.h"
 #include "LinkDiagnosticInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -290,6 +292,9 @@ class IRLinker {
   Module &DstM;
   std::unique_ptr<Module> SrcM;
 
+  // Lookup table to optimize IRLinker::linkNamedMDNodes().
+  IRMover::NamedMDNodesT &NamedMDNodes;
+
   /// See IRMover::move().
   IRMover::LazyCallback AddLazyFor;
 
@@ -435,10 +440,12 @@ public:
   IRLinker(Module &DstM, MDMapT &SharedMDs,
            IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
            ArrayRef<GlobalValue *> ValuesToLink,
-           IRMover::LazyCallback AddLazyFor, bool IsPerformingImport)
-      : DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
-        TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this),
-        SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport),
+           IRMover::LazyCallback AddLazyFor, bool IsPerformingImport,
+           IRMover::NamedMDNodesT &NamedMDNodes)
+      : DstM(DstM), SrcM(std::move(SrcM)), NamedMDNodes(NamedMDNodes),
+        AddLazyFor(std::move(AddLazyFor)), TypeMap(Set),
+        GValMaterializer(*this), LValMaterializer(*this), SharedMDs(SharedMDs),
+        IsPerformingImport(IsPerformingImport),
         Mapper(ValueMap, RF_ReuseAndMutateDistinctMDs | RF_IgnoreMissingLocals,
                &TypeMap, &GValMaterializer),
         IndirectSymbolMCID(Mapper.registerAlternateMappingContext(
@@ -1132,10 +1139,17 @@ void IRLinker::linkNamedMDNodes() {
       continue;
 
     NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName());
+
+    auto &Inserted = NamedMDNodes[DestNMD];
+    if (Inserted.empty()) {
+      // Must be the first module, copy everything from DestNMD.
+      Inserted.insert(DestNMD->operands().begin(), DestNMD->operands().end());
+    }
+
     // Add Src elements into Dest node.
     for (const MDNode *Op : NMD.operands()) {
       MDNode *MD = Mapper.mapMDNode(*Op);
-      if (!is_contained(DestNMD->operands(), MD))
+      if (Inserted.insert(MD).second)
         DestNMD->addOperand(MD);
     }
   }
@@ -1670,6 +1684,6 @@ Error IRMover::move(std::unique_ptr<Module> Src,
                     LazyCallback AddLazyFor, bool IsPerformingImport) {
   IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
                        std::move(Src), ValuesToLink, std::move(AddLazyFor),
-                       IsPerformingImport);
+                       IsPerformingImport, NamedMDNodes);
   return TheIRLinker.run();
 }
diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp
index 482280b5ef28..b9ebb7a9e789 100644
--- a/llvm/lib/MC/DXContainerRootSignature.cpp
+++ b/llvm/lib/MC/DXContainerRootSignature.cpp
@@ -20,49 +20,62 @@ static uint32_t writePlaceholder(raw_svector_ostream &Stream) {
   return Offset;
 }
 
-static void rewriteOffsetToCurrentByte(raw_svector_ostream &Stream,
-                                       uint32_t Offset) {
+static uint32_t rewriteOffsetToCurrentByte(raw_svector_ostream &Stream,
+                                           uint32_t Offset) {
+  uint32_t ByteOffset = Stream.tell();
   uint32_t Value =
       support::endian::byte_swap<uint32_t, llvm::endianness::little>(
-          Stream.tell());
+          ByteOffset);
   Stream.pwrite(reinterpret_cast<const char *>(&Value), sizeof(Value), Offset);
+  return ByteOffset;
 }
 
 size_t RootSignatureDesc::getSize() const {
-  size_t Size =
-      sizeof(dxbc::RTS0::v1::RootSignatureHeader) +
-      ParametersContainer.size() * sizeof(dxbc::RTS0::v1::RootParameterHeader) +
+  uint32_t StaticSamplersOffset = computeStaticSamplersOffset();
+  size_t StaticSamplersSize =
       StaticSamplers.size() * sizeof(dxbc::RTS0::v1::StaticSampler);
 
+  return size_t(StaticSamplersOffset) + StaticSamplersSize;
+}
+
+uint32_t RootSignatureDesc::computeRootParametersOffset() const {
+  return sizeof(dxbc::RTS0::v1::RootSignatureHeader);
+}
+
+uint32_t RootSignatureDesc::computeStaticSamplersOffset() const {
+  uint32_t Offset = computeRootParametersOffset();
+
   for (const RootParameterInfo &I : ParametersContainer) {
-    switch (I.Header.ParameterType) {
-    case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit):
-      Size += sizeof(dxbc::RTS0::v1::RootConstants);
+    Offset += sizeof(dxbc::RTS0::v1::RootParameterHeader);
+    switch (I.Type) {
+    case dxbc::RootParameterType::Constants32Bit:
+      Offset += sizeof(dxbc::RTS0::v1::RootConstants);
       break;
-    case llvm::to_underlying(dxbc::RootParameterType::CBV):
-    case llvm::to_underlying(dxbc::RootParameterType::SRV):
-    case llvm::to_underlying(dxbc::RootParameterType::UAV):
+    case dxbc::RootParameterType::CBV:
+    case dxbc::RootParameterType::SRV:
+    case dxbc::RootParameterType::UAV:
       if (Version == 1)
-        Size += sizeof(dxbc::RTS0::v1::RootDescriptor);
+        Offset += sizeof(dxbc::RTS0::v1::RootDescriptor);
       else
-        Size += sizeof(dxbc::RTS0::v2::RootDescriptor);
+        Offset += sizeof(dxbc::RTS0::v2::RootDescriptor);
 
       break;
-    case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable):
+    case dxbc::RootParameterType::DescriptorTable:
       const DescriptorTable &Table =
           ParametersContainer.getDescriptorTable(I.Location);
 
       // 4 bytes for the number of ranges in table and
       // 4 bytes for the ranges offset
-      Size += 2 * sizeof(uint32_t);
+      Offset += 2 * sizeof(uint32_t);
       if (Version == 1)
-        Size += sizeof(dxbc::RTS0::v1::DescriptorRange) * Table.Ranges.size();
+        Offset += sizeof(dxbc::RTS0::v1::DescriptorRange) * Table.Ranges.size();
       else
-        Size += sizeof(dxbc::RTS0::v2::DescriptorRange) * Table.Ranges.size();
+        Offset += sizeof(dxbc::RTS0::v2::DescriptorRange) * Table.Ranges.size();
       break;
     }
   }
-  return Size;
+
+  return Offset;
 }
 
 void RootSignatureDesc::write(raw_ostream &OS) const {
@@ -76,19 +89,13 @@ void RootSignatureDesc::write(raw_ostream &OS) const {
   support::endian::write(BOS, NumParameters, llvm::endianness::little);
   support::endian::write(BOS, RootParameterOffset, llvm::endianness::little);
   support::endian::write(BOS, NumSamplers, llvm::endianness::little);
-  uint32_t SSO = StaticSamplersOffset;
-  if (NumSamplers > 0)
-    SSO = writePlaceholder(BOS);
-  else
-    support::endian::write(BOS, SSO, llvm::endianness::little);
+  uint32_t SSO = writePlaceholder(BOS);
   support::endian::write(BOS, Flags, llvm::endianness::little);
 
   SmallVector<uint32_t> ParamsOffsets;
-  for (const RootParameterInfo &P : ParametersContainer) {
-    support::endian::write(BOS, P.Header.ParameterType,
-                           llvm::endianness::little);
-    support::endian::write(BOS, P.Header.ShaderVisibility,
-                           llvm::endianness::little);
+  for (const RootParameterInfo &I : ParametersContainer) {
+    support::endian::write(BOS, I.Type, llvm::endianness::little);
+    support::endian::write(BOS, I.Visibility, llvm::endianness::little);
 
     ParamsOffsets.push_back(writePlaceholder(BOS));
   }
@@ -96,11 +103,11 @@ void RootSignatureDesc::write(raw_ostream &OS) const {
   assert(NumParameters == ParamsOffsets.size());
   for (size_t I = 0; I < NumParameters; ++I) {
     rewriteOffsetToCurrentByte(BOS, ParamsOffsets[I]);
-    const auto &[Type, Loc] = ParametersContainer.getTypeAndLocForParameter(I);
-    switch (Type) {
-    case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): {
-      const dxbc::RTS0::v1::RootConstants &Constants =
-          ParametersContainer.getConstant(Loc);
+    const RootParameterInfo &Info = ParametersContainer.getInfo(I);
+    switch (Info.Type) {
+    case dxbc::RootParameterType::Constants32Bit: {
+      const mcdxbc::RootConstants &Constants =
+          ParametersContainer.getConstant(Info.Location);
       support::endian::write(BOS, Constants.ShaderRegister,
                              llvm::endianness::little);
       support::endian::write(BOS, Constants.RegisterSpace,
@@ -109,11 +116,11 @@ void RootSignatureDesc::write(raw_ostream &OS) const {
                              llvm::endianness::little);
       break;
     }
-    case llvm::to_underlying(dxbc::RootParameterType::CBV):
-    case llvm::to_underlying(dxbc::RootParameterType::SRV):
-    case llvm::to_underlying(dxbc::RootParameterType::UAV): {
-      const dxbc::RTS0::v2::RootDescriptor &Descriptor =
-          ParametersContainer.getRootDescriptor(Loc);
+    case dxbc::RootParameterType::CBV:
+    case dxbc::RootParameterType::SRV:
+    case dxbc::RootParameterType::UAV: {
+      const mcdxbc::RootDescriptor &Descriptor =
+          ParametersContainer.getRootDescriptor(Info.Location);
 
       support::endian::write(BOS, Descriptor.ShaderRegister,
                              llvm::endianness::little);
@@ -123,14 +130,15 @@ void RootSignatureDesc::write(raw_ostream &OS) const {
         support::endian::write(BOS, Descriptor.Flags, llvm::endianness::little);
       break;
     }
-    case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+    case dxbc::RootParameterType::DescriptorTable: {
       const DescriptorTable &Table =
-          ParametersContainer.getDescriptorTable(Loc);
+          ParametersContainer.getDescriptorTable(Info.Location);
       support::endian::write(BOS, (uint32_t)Table.Ranges.size(),
                              llvm::endianness::little);
       rewriteOffsetToCurrentByte(BOS, writePlaceholder(BOS));
       for (const auto &Range : Table) {
-        support::endian::write(BOS, Range.RangeType, llvm::endianness::little);
+        support::endian::write(BOS, static_cast<uint32_t>(Range.RangeType),
+                               llvm::endianness::little);
         support::endian::write(BOS, Range.NumDescriptors,
                                llvm::endianness::little);
         support::endian::write(BOS, Range.BaseShaderRegister,
@@ -146,23 +154,23 @@ void RootSignatureDesc::write(raw_ostream &OS) const {
     }
     }
   }
-  if (NumSamplers > 0) {
-    rewriteOffsetToCurrentByte(BOS, SSO);
-    for (const auto &S : StaticSamplers) {
-      support::endian::write(BOS, S.Filter, llvm::endianness::little);
-      support::endian::write(BOS, S.AddressU, llvm::endianness::little);
-      support::endian::write(BOS, S.AddressV, llvm::endianness::little);
-      support::endian::write(BOS, S.AddressW, llvm::endianness::little);
-      support::endian::write(BOS, S.MipLODBias, llvm::endianness::little);
-      support::endian::write(BOS, S.MaxAnisotropy, llvm::endianness::little);
-      support::endian::write(BOS, S.ComparisonFunc, llvm::endianness::little);
-      support::endian::write(BOS, S.BorderColor, llvm::endianness::little);
-      support::endian::write(BOS, S.MinLOD, llvm::endianness::little);
-      support::endian::write(BOS, S.MaxLOD, llvm::endianness::little);
-      support::endian::write(BOS, S.ShaderRegister, llvm::endianness::little);
-      support::endian::write(BOS, S.RegisterSpace, llvm::endianness::little);
-      support::endian::write(BOS, S.ShaderVisibility, llvm::endianness::little);
-    }
+  [[maybe_unused]] uint32_t Offset = rewriteOffsetToCurrentByte(BOS, SSO);
+  assert(Offset == computeStaticSamplersOffset() &&
+         "Computed offset does not match written offset");
+  for (const auto &S : StaticSamplers) {
+    support::endian::write(BOS, S.Filter, llvm::endianness::little);
+    support::endian::write(BOS, S.AddressU, llvm::endianness::little);
+    support::endian::write(BOS, S.AddressV, llvm::endianness::little);
+    support::endian::write(BOS, S.AddressW, llvm::endianness::little);
+    support::endian::write(BOS, S.MipLODBias, llvm::endianness::little);
+    support::endian::write(BOS, S.MaxAnisotropy, llvm::endianness::little);
+    support::endian::write(BOS, S.ComparisonFunc, llvm::endianness::little);
+    support::endian::write(BOS, S.BorderColor, llvm::endianness::little);
+    support::endian::write(BOS, S.MinLOD, llvm::endianness::little);
+    support::endian::write(BOS, S.MaxLOD, llvm::endianness::little);
+    support::endian::write(BOS, S.ShaderRegister, llvm::endianness::little);
+    support::endian::write(BOS, S.RegisterSpace, llvm::endianness::little);
+    support::endian::write(BOS, S.ShaderVisibility, llvm::endianness::little);
   }
   assert(Storage.size() == getSize());
   OS.write(Storage.data(), Storage.size());
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index 684413e1e3a5..0429227f0fec 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -17,7 +17,6 @@
 #include "llvm/MC/MCDisassembler/MCSymbolizer.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSchedule.h"
@@ -45,20 +44,23 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU,
                             const char *Features, void *DisInfo, int TagType,
                             LLVMOpInfoCallback GetOpInfo,
                             LLVMSymbolLookupCallback SymbolLookUp) {
+  Triple TheTriple(TT);
+
   // Get the target.
   std::string Error;
-  const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
+  const Target *TheTarget = TargetRegistry::lookupTarget(TheTriple, Error);
   if (!TheTarget)
     return nullptr;
 
-  std::unique_ptr<const MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TT));
+  std::unique_ptr<const MCRegisterInfo> MRI(
+      TheTarget->createMCRegInfo(TheTriple));
   if (!MRI)
     return nullptr;
 
   MCTargetOptions MCOptions;
   // Get the assembler info needed to setup the MCContext.
   std::unique_ptr<const MCAsmInfo> MAI(
-      TheTarget->createMCAsmInfo(*MRI, TT, MCOptions));
+      TheTarget->createMCAsmInfo(*MRI, TheTriple, MCOptions));
   if (!MAI)
     return nullptr;
 
@@ -67,13 +69,13 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU,
     return nullptr;
 
   std::unique_ptr<const MCSubtargetInfo> STI(
-      TheTarget->createMCSubtargetInfo(TT, CPU, Features));
+      TheTarget->createMCSubtargetInfo(TheTriple, CPU, Features));
   if (!STI)
     return nullptr;
 
   // Set up the MCContext for creating symbols and MCExpr's.
   std::unique_ptr<MCContext> Ctx(
-      new MCContext(Triple(TT), MAI.get(), MRI.get(), STI.get()));
+      new MCContext(TheTriple, MAI.get(), MRI.get(), STI.get()));
   if (!Ctx)
     return nullptr;
 
@@ -84,12 +86,13 @@ LLVMCreateDisasmCPUFeatures(const char *TT, const char *CPU,
     return nullptr;
 
   std::unique_ptr<MCRelocationInfo> RelInfo(
-      TheTarget->createMCRelocationInfo(TT, *Ctx));
+      TheTarget->createMCRelocationInfo(TheTriple, *Ctx));
   if (!RelInfo)
     return nullptr;
 
-  std::unique_ptr<MCSymbolizer> Symbolizer(TheTarget->createMCSymbolizer(
-      TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx.get(), std::move(RelInfo)));
+  std::unique_ptr<MCSymbolizer> Symbolizer(
+      TheTarget->createMCSymbolizer(TheTriple, GetOpInfo, SymbolLookUp, DisInfo,
+                                    Ctx.get(), std::move(RelInfo)));
   DisAsm->setSymbolizer(std::move(Symbolizer));
 
   // Set up the instruction printer.
diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp
index 1718e2a4eb2d..8b228db0e8b3 100644
--- a/llvm/lib/MC/MCGOFFStreamer.cpp
+++ b/llvm/lib/MC/MCGOFFStreamer.cpp
@@ -45,3 +45,9 @@ MCStreamer *llvm::createGOFFStreamer(MCContext &Context,
       new MCGOFFStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
   return S;
 }
+llvm::MCGOFFStreamer::MCGOFFStreamer(MCContext &Context,
+                                     std::unique_ptr<MCAsmBackend> MAB,
+                                     std::unique_ptr<MCObjectWriter> OW,
+                                     std::unique_ptr<MCCodeEmitter> Emitter)
+    : MCObjectStreamer(Context, std::move(MAB), std::move(OW),
+                       std::move(Emitter)) {} // Only forwards to the base.
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index d505ac6dd4bf..a0cd09b11d8d 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -849,6 +849,16 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
   StackMapSection = Ctx->getCOFFSection(".llvm_stackmaps",
                                         COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                                             COFF::IMAGE_SCN_MEM_READ);
+
+  // Set IMAGE_SCN_MEM_DISCARDABLE so that lld will not truncate section name.
+  PseudoProbeSection = Ctx->getCOFFSection(
+      ".pseudo_probe", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                           COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                           COFF::IMAGE_SCN_MEM_READ);
+  PseudoProbeDescSection = Ctx->getCOFFSection(
+      ".pseudo_probe_desc", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                                COFF::IMAGE_SCN_MEM_READ);
 }
 
 void MCObjectFileInfo::initSPIRVMCObjectFileInfo(const Triple &T) {
@@ -1220,44 +1230,68 @@ MCObjectFileInfo::getKCFITrapSection(const MCSection &TextSec) const {
 
 MCSection *
 MCObjectFileInfo::getPseudoProbeSection(const MCSection &TextSec) const {
-  if (Ctx->getObjectFileType() != MCContext::IsELF)
-    return PseudoProbeSection;
-
-  const auto &ElfSec = static_cast<const MCSectionELF &>(TextSec);
-  unsigned Flags = ELF::SHF_LINK_ORDER;
-  StringRef GroupName;
-  if (const MCSymbol *Group = ElfSec.getGroup()) {
-    GroupName = Group->getName();
-    Flags |= ELF::SHF_GROUP;
+  auto ObjFileType = Ctx->getObjectFileType();
+  if (ObjFileType == MCContext::IsELF) {
+    const auto &ElfSec = static_cast<const MCSectionELF &>(TextSec);
+    unsigned Flags = ELF::SHF_LINK_ORDER;
+    StringRef GroupName;
+    if (const MCSymbol *Group = ElfSec.getGroup()) {
+      GroupName = Group->getName();
+      Flags |= ELF::SHF_GROUP;
+    }
+    return Ctx->getELFSection(
+        PseudoProbeSection->getName(), ELF::SHT_PROGBITS, Flags, 0, GroupName,
+        true, ElfSec.getUniqueID(),
+        static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
+  } else if (ObjFileType == MCContext::IsCOFF) {
+    StringRef COMDATSymName = "";
+    int Selection = 0;
+    unsigned Characteristics =
+        static_cast<MCSectionCOFF *>(PseudoProbeSection)->getCharacteristics();
+    const auto &COFFSec = static_cast<const MCSectionCOFF &>(TextSec);
+    if (const MCSymbol *COMDATSym = COFFSec.getCOMDATSymbol()) {
+      // Associate .pseudo_probe to its function section.
+      COMDATSymName = COMDATSym->getName();
+      Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+      Selection = COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+    }
+    return Ctx->getCOFFSection(PseudoProbeSection->getName(), Characteristics,
+                               COMDATSymName, Selection, COFFSec.getUniqueID());
   }
 
-  return Ctx->getELFSection(
-      PseudoProbeSection->getName(), ELF::SHT_PROGBITS, Flags, 0, GroupName,
-      true, ElfSec.getUniqueID(),
-      static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
+  return PseudoProbeSection;
 }
 
 MCSection *
 MCObjectFileInfo::getPseudoProbeDescSection(StringRef FuncName) const {
-  if (Ctx->getObjectFileType() == MCContext::IsELF) {
-    // Create a separate comdat group for each function's descriptor in order
-    // for the linker to deduplicate. The duplication, must be from different
-    // tranlation unit, can come from:
-    //  1. Inline functions defined in header files;
-    //  2. ThinLTO imported funcions;
-    //  3. Weak-linkage definitions.
-    // Use a concatenation of the section name and the function name as the
-    // group name so that descriptor-only groups won't be folded with groups of
-    // code.
-    if (Ctx->getTargetTriple().supportsCOMDAT() && !FuncName.empty()) {
-      auto *S = static_cast<MCSectionELF *>(PseudoProbeDescSection);
-      auto Flags = S->getFlags() | ELF::SHF_GROUP;
-      return Ctx->getELFSection(S->getName(), S->getType(), Flags,
-                                S->getEntrySize(),
-                                S->getName() + "_" + FuncName,
-                                /*IsComdat=*/true);
-    }
+  if (!Ctx->getTargetTriple().supportsCOMDAT() || FuncName.empty())
+    return PseudoProbeDescSection;
+
+  // Create a separate comdat group for each function's descriptor in order
+  // for the linker to deduplicate. The duplicates, which must come from
+  // different translation units, can arise from:
+  //  1. Inline functions defined in header files;
+  //  2. ThinLTO imported functions;
+  //  3. Weak-linkage definitions.
+  // Use a concatenation of the section name and the function name as the
+  // group name so that descriptor-only groups won't be folded with groups of
+  // code.
+  auto ObjFileType = Ctx->getObjectFileType();
+  if (ObjFileType == MCContext::IsELF) {
+    auto *S = static_cast<MCSectionELF *>(PseudoProbeDescSection);
+    auto Flags = S->getFlags() | ELF::SHF_GROUP;
+    return Ctx->getELFSection(S->getName(), S->getType(), Flags,
+                              S->getEntrySize(), S->getName() + "_" + FuncName,
+                              /*IsComdat=*/true);
+  } else if (ObjFileType == MCContext::IsCOFF) {
+    auto *S = static_cast<MCSectionCOFF *>(PseudoProbeDescSection);
+    unsigned Characteristics =
+        S->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
+    std::string COMDATSymName = (S->getName() + "_" + FuncName).str();
+    return Ctx->getCOFFSection(S->getName(), Characteristics, COMDATSymName,
+                               COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH);
   }
+
   return PseudoProbeDescSection;
 }
 
diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp
index ee17b774e474..a0d6c80ab72e 100644
--- a/llvm/lib/MC/MCSFrame.cpp
+++ b/llvm/lib/MC/MCSFrame.cpp
@@ -21,6 +21,24 @@ using namespace sframe;
 
 namespace {
 
+// High-level structure to track info needed to emit a
+// sframe_frame_row_entry_addrX. On disk these have both a fixed portion of type
+// sframe_frame_row_entry_addrX and trailing data of X * S bytes, where X is the
+// datum size, and S is 1, 2, or 3 depending on which of CFA, RA, and FP are
+// being tracked.
+struct SFrameFRE {
+  // An FRE describes how to find the registers when the PC is at this
+  // Label from function start.
+  const MCSymbol *Label = nullptr;
+  size_t CFAOffset = 0;   // Most recent CFA offset from its base register.
+  size_t FPOffset = 0;    // Most recent offset recorded for FPReg.
+  size_t RAOffset = 0;    // Most recent offset recorded for RAReg.
+  bool FromFP = false;    // True when the CFA base register is FPReg.
+  bool CFARegSet = false; // True once a CFA base register was accepted.
+
+  SFrameFRE(const MCSymbol *Start) : Label(Start) {}
+};
+
 // High-level structure to track info needed to emit a sframe_func_desc_entry
 // and its associated FREs.
 struct SFrameFDE {
@@ -28,6 +46,8 @@ struct SFrameFDE {
   const MCDwarfFrameInfo &DFrame;
   // Label where this FDE's FREs start.
   MCSymbol *FREStart;
+  // Frame row entries (FREs) collected for this function.
+  SmallVector<SFrameFRE> FREs;
 
   SFrameFDE(const MCDwarfFrameInfo &DF, MCSymbol *FRES)
       : DFrame(DF), FREStart(FRES) {}
@@ -53,7 +73,8 @@ struct SFrameFDE {
                                 MCFixup::getDataKindForSize(4)));
     S.emitInt32(0);
 
-    // sfde_func_start_num_fres
+    // sfde_func_num_fres
+    // TODO: When we actually emit fres, replace 0 with FREs.size()
     S.emitInt32(0);
 
     // sfde_func_info word
@@ -76,10 +97,90 @@ class SFrameEmitterImpl {
   MCObjectStreamer &Streamer;
   SmallVector<SFrameFDE> FDEs;
   ABI SFrameABI;
+  // Target-specific convenience variables to detect when a CFI instruction
+  // references these registers. Unlike in dwarf frame descriptions, they never
+  // escape into the sframe section itself.
+  unsigned SPReg;
+  unsigned FPReg;
+  unsigned RAReg;
   MCSymbol *FDESubSectionStart;
   MCSymbol *FRESubSectionStart;
   MCSymbol *FRESubSectionEnd;
 
+  bool setCFARegister(SFrameFRE &FRE, const MCCFIInstruction &I) {
+    const auto Reg = I.getRegister();
+    const bool IsSP = Reg == SPReg;
+    if (!IsSP && Reg != FPReg) {
+      // Only SP- or FP-based CFAs are supported; leave FRE untouched.
+      Streamer.getContext().reportWarning(
+          I.getLoc(),
+          "canonical Frame Address not in stack- or frame-pointer. "
+          "Omitting SFrame unwind info for this function");
+      return false;
+    }
+    // The CFA is now based on either SP or FP.
+    FRE.CFARegSet = true;
+    FRE.FromFP = !IsSP;
+    return true;
+  }
+
+  bool setCFAOffset(SFrameFRE &FRE, const SMLoc &Loc, size_t Offset) {
+    if (FRE.CFARegSet) { // Offsets need a known CFA base register.
+      FRE.CFAOffset = Offset;
+      return true;
+    }
+    Streamer.getContext().reportWarning(
+        Loc, "adjusting CFA offset without a base register. "
+             "Omitting SFrame unwind info for this function");
+    return false;
+  }
+
+  // Apply the effect of CFI to FRE. Returns false if the instruction cannot
+  // be represented. FDE is not used yet; see the TODOs in the cases below.
+  bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) {
+    switch (CFI.getOperation()) {
+    case MCCFIInstruction::OpDefCfaRegister:
+      return setCFARegister(FRE, CFI);
+    case MCCFIInstruction::OpDefCfa:
+    case MCCFIInstruction::OpLLVMDefAspaceCfa:
+      if (!setCFARegister(FRE, CFI))
+        return false;
+      return setCFAOffset(FRE, CFI.getLoc(), CFI.getOffset());
+    case MCCFIInstruction::OpOffset:
+      if (CFI.getRegister() == FPReg)
+        FRE.FPOffset = CFI.getOffset();
+      else if (CFI.getRegister() == RAReg)
+        FRE.RAOffset = CFI.getOffset();
+      return true;
+    case MCCFIInstruction::OpRelOffset:
+      if (CFI.getRegister() == FPReg)
+        FRE.FPOffset += CFI.getOffset();
+      else if (CFI.getRegister() == RAReg)
+        FRE.RAOffset += CFI.getOffset();
+      return true;
+    case MCCFIInstruction::OpDefCfaOffset:
+      return setCFAOffset(FRE, CFI.getLoc(), CFI.getOffset());
+    case MCCFIInstruction::OpAdjustCfaOffset:
+      return setCFAOffset(FRE, CFI.getLoc(), FRE.CFAOffset + CFI.getOffset());
+    case MCCFIInstruction::OpRememberState:
+      // TODO: Implement. Will use FDE.
+      return true;
+    case MCCFIInstruction::OpRestore:
+      // TODO: Implement. Will use FDE.
+      return true;
+    case MCCFIInstruction::OpRestoreState:
+      // TODO: Implement. Will use FDE.
+      return true;
+    case MCCFIInstruction::OpEscape:
+      // TODO: Implement. Will use FDE.
+      return true;
+    default:
+      // Instructions that don't affect the CFA, RA, and FP can be safely
+      // ignored.
+      return true;
+    }
+  }
+
 public:
   SFrameEmitterImpl(MCObjectStreamer &Streamer) : Streamer(Streamer) {
     assert(Streamer.getContext()
@@ -88,13 +189,96 @@ public:
                .has_value());
     FDEs.reserve(Streamer.getDwarfFrameInfos().size());
     SFrameABI = *Streamer.getContext().getObjectFileInfo()->getSFrameABIArch();
+    switch (SFrameABI) {
+    case ABI::AArch64EndianBig:
+    case ABI::AArch64EndianLittle:
+      SPReg = 31; // DWARF number of SP.
+      RAReg = 30; // x30 (LR) holds the return address.
+      FPReg = 29; // x29 is the frame pointer.
+      break;
+    case ABI::AMD64EndianLittle:
+      SPReg = 7;
+      // RARegister untracked in this abi. Value chosen to match
+      // MCDwarfFrameInfo constructor.
+      RAReg = static_cast<unsigned>(INT_MAX);
+      FPReg = 6;
+      break;
+    }
+
     FDESubSectionStart = Streamer.getContext().createTempSymbol();
     FRESubSectionStart = Streamer.getContext().createTempSymbol();
     FRESubSectionEnd = Streamer.getContext().createTempSymbol();
   }
 
-  void BuildSFDE(const MCDwarfFrameInfo &DF) {
-    FDEs.emplace_back(DF, Streamer.getContext().createTempSymbol());
+  bool atSameLocation(const MCSymbol *Left, const MCSymbol *Right) {
+    const bool BothPresent = Left && Right; // Null labels never match.
+    return BothPresent && Left->getFragment() == Right->getFragment() &&
+           Left->getOffset() == Right->getOffset();
+  }
+
+  bool equalIgnoringLocation(const SFrameFRE &Left, const SFrameFRE &Right) {
+    return Left.FromFP == Right.FromFP && Left.CFARegSet == Right.CFARegSet &&
+           Left.RAOffset == Right.RAOffset && Left.FPOffset == Right.FPOffset &&
+           Left.CFAOffset == Right.CFAOffset; // Label deliberately skipped.
+  }
+
+  void buildSFDE(const MCDwarfFrameInfo &DF) {
+    bool Valid = true; // Cleared on unsupported input; FDE then dropped.
+    SFrameFDE FDE(DF, Streamer.getContext().createTempSymbol());
+    // This would have been set via ".cfi_return_column", but
+    // MCObjectStreamer doesn't emit an MCCFIInstruction for that. It just
+    // sets the DF.RAReg.
+    // FIXME: This also prevents providing a proper location for the error.
+    // LLVM doesn't change the return column itself, so this was
+    // hand-written assembly.
+    if (DF.RAReg != RAReg) {
+      Streamer.getContext().reportWarning(
+          SMLoc(), "non-default RA register in .cfi_return_column " +
+                       Twine(DF.RAReg) +
+                       ". Omitting SFrame unwind info for this function");
+      Valid = false;
+    }
+    MCSymbol *LastLabel = DF.Begin;
+    SFrameFRE BaseFRE(LastLabel);
+    if (!DF.IsSimple) {
+      for (const auto &CFI :
+           Streamer.getContext().getAsmInfo()->getInitialFrameState())
+        if (!handleCFI(FDE, BaseFRE, CFI))
+          Valid = false;
+    }
+    FDE.FREs.push_back(BaseFRE);
+
+    for (const auto &CFI : DF.Instructions) {
+      // Instructions from InitialFrameState may not have a label. Skip any
+      // instruction whose label exists but is undefined (likely dead code).
+      // TODO: This check follows MCDwarf.cpp
+      // FrameEmitterImplementation::emitCFIInstructions, but nothing in the
+      // testsuite triggers it. We should see if it can be removed in both
+      // places, or alternately, add a test to exercise it.
+      auto *L = CFI.getLabel();
+      if (L && !L->isDefined())
+        continue;
+
+      SFrameFRE FRE = FDE.FREs.back();
+      if (!handleCFI(FDE, FRE, CFI))
+        Valid = false;
+
+      // If nothing relevant but the location changed, don't add the FRE.
+      if (equalIgnoringLocation(FRE, FDE.FREs.back()))
+        continue;
+
+      // If the location stayed the same, then update the current
+      // row. Otherwise, add a new one.
+      if (atSameLocation(LastLabel, L))
+        FDE.FREs.back() = FRE;
+      else {
+        FDE.FREs.push_back(FRE);
+        FDE.FREs.back().Label = L;
+        LastLabel = L;
+      }
+    }
+    if (Valid)
+      FDEs.push_back(FDE);
+  }
 
   void emitPreamble() {
@@ -116,7 +300,9 @@ public:
     // shf_num_fdes
     Streamer.emitInt32(FDEs.size());
     // shf_num_fres
-    Streamer.emitInt32(0);
+    uint32_t TotalFREs = 0;
+    Streamer.emitInt32(TotalFREs);
+
     // shf_fre_len
     Streamer.emitAbsoluteSymbolDiff(FRESubSectionEnd, FRESubSectionStart,
                                     sizeof(int32_t));
@@ -161,7 +347,7 @@ void MCSFrameEmitter::emit(MCObjectStreamer &Streamer) {
   // Both the header itself and the FDEs include various offsets and counts.
   // Therefore, all of this must be precomputed.
   for (const auto &DFrame : FrameArray)
-    Emitter.BuildSFDE(DFrame);
+    Emitter.buildSFDE(DFrame);
 
   MCSection *Section = Context.getObjectFileInfo()->getSFrameSection();
   // Not strictly necessary, but gas always aligns to 8, so match that.
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index a87648afde7d..8111ccb8bc69 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -22,6 +22,7 @@ class MCSection;
 
 /// MCExpr that represents the epilog unwind code in an unwind table.
 class MCUnwindV2EpilogTargetExpr final : public MCTargetExpr {
+  const MCSymbol *Function;
   const MCSymbol *FunctionEnd;
   const MCSymbol *UnwindV2Start;
   const MCSymbol *EpilogEnd;
@@ -31,7 +32,7 @@ class MCUnwindV2EpilogTargetExpr final : public MCTargetExpr {
   MCUnwindV2EpilogTargetExpr(const WinEH::FrameInfo &FrameInfo,
                              const WinEH::FrameInfo::Epilog &Epilog,
                              uint8_t EpilogSize_)
-      : FunctionEnd(FrameInfo.FuncletOrFuncEnd),
+      : Function(FrameInfo.Function), FunctionEnd(FrameInfo.FuncletOrFuncEnd),
         UnwindV2Start(Epilog.UnwindV2Start), EpilogEnd(Epilog.End),
         EpilogSize(EpilogSize_), Loc(Epilog.Loc) {}
 
@@ -253,13 +254,15 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
         OS->getAssembler(), LastEpilog.End, LastEpilog.UnwindV2Start);
     if (!MaybeSize) {
       context.reportError(LastEpilog.Loc,
-                          "Failed to evaluate epilog size for Unwind v2");
+                          "Failed to evaluate epilog size for Unwind v2 in " +
+                              info->Function->getName());
       return;
     }
     assert(*MaybeSize >= 0);
     if (*MaybeSize >= (int64_t)UINT8_MAX) {
       context.reportError(LastEpilog.Loc,
-                          "Epilog size is too large for Unwind v2");
+                          "Epilog size is too large for Unwind v2 in " +
+                              info->Function->getName());
       return;
     }
     EpilogSize = *MaybeSize + 1;
@@ -282,7 +285,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
     // Too many epilogs to handle.
     if ((size_t)numCodes + numEpilogCodes > UINT8_MAX) {
       context.reportError(info->FunctionLoc,
-                          "Too many unwind codes with Unwind v2 enabled");
+                          "Too many unwind codes with Unwind v2 enabled in " +
+                              info->Function->getName());
       return;
     }
 
@@ -383,14 +387,16 @@ bool MCUnwindV2EpilogTargetExpr::evaluateAsRelocatableImpl(
   auto Offset = GetOptionalAbsDifference(*Asm, FunctionEnd, UnwindV2Start);
   if (!Offset) {
     Asm->getContext().reportError(
-        Loc, "Failed to evaluate epilog offset for Unwind v2");
+        Loc, "Failed to evaluate epilog offset for Unwind v2 in " +
+                 Function->getName());
     return false;
   }
   assert(*Offset > 0);
   constexpr uint16_t MaxEpilogOffset = 0x0fff;
   if (*Offset > MaxEpilogOffset) {
-    Asm->getContext().reportError(Loc,
-                                  "Epilog offset is too large for Unwind v2");
+    Asm->getContext().reportError(
+        Loc,
+        "Epilog offset is too large for Unwind v2 in " + Function->getName());
     return false;
   }
 
@@ -398,8 +404,8 @@ bool MCUnwindV2EpilogTargetExpr::evaluateAsRelocatableImpl(
   auto Size = GetOptionalAbsDifference(*Asm, EpilogEnd, UnwindV2Start);
   if (Size != (EpilogSize - 1)) {
     Asm->getContext().reportError(
-        Loc,
-        "Size of this epilog does not match size of last epilog in function");
+        Loc, "Size of this epilog does not match size of last epilog in " +
+                 Function->getName());
     return false;
   }
 
diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.cpp b/llvm/lib/ObjCopy/COFF/COFFObject.cpp
index 5fa13391c908..91cf7e32a739 100644
--- a/llvm/lib/ObjCopy/COFF/COFFObject.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFObject.cpp
@@ -18,6 +18,8 @@ using namespace object;
 void Object::addSymbols(ArrayRef<Symbol> NewSymbols) {
   for (Symbol S : NewSymbols) {
     S.UniqueId = NextSymbolUniqueId++;
+    S.OriginalRawIndex = NextSymbolOriginalIndex;
+    NextSymbolOriginalIndex += 1 + S.Sym.NumberOfAuxSymbols;
     Symbols.emplace_back(S);
   }
   updateSymbols();
diff --git a/llvm/lib/ObjCopy/COFF/COFFObject.h b/llvm/lib/ObjCopy/COFF/COFFObject.h
index cdd1f17fc605..6b70add1bb1b 100644
--- a/llvm/lib/ObjCopy/COFF/COFFObject.h
+++ b/llvm/lib/ObjCopy/COFF/COFFObject.h
@@ -89,6 +89,7 @@ struct Symbol {
   std::optional<size_t> WeakTargetSymbolId;
   size_t UniqueId;
   size_t RawIndex;
+  size_t OriginalRawIndex;
   bool Referenced;
 };
 
@@ -140,6 +141,7 @@ private:
   DenseMap<size_t, Symbol *> SymbolMap;
 
   size_t NextSymbolUniqueId = 0;
+  size_t NextSymbolOriginalIndex = 0;
 
   std::vector<Section> Sections;
   DenseMap<ssize_t, Section *> SectionMap;
diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.cpp b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
index 350c4aec572c..fed67d67f13a 100644
--- a/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
@@ -12,6 +12,8 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Support/CRC.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cstddef>
@@ -92,6 +94,77 @@ Error COFFWriter::finalizeSymbolContents() {
   return Error::success();
 }
 
+/// Rewrites the symbol indices stored in .symidx-only sections from their
+/// original raw index to the current one and updates the section checksums.
+Error COFFWriter::finalizeSymIdxContents() {
+  // CFGuards shouldn't be present in PE.
+  if (Obj.IsPE)
+    return Error::success();
+
+  // Currently handle only sections consisting only of .symidx.
+  // TODO: other sections such as .impcall and .hybmp$x require more complex
+  // handling as they have more complex layout.
+  auto IsSymIdxSection = [](StringRef Name) {
+    return Name == ".gljmp$y" || Name == ".giats$y" || Name == ".gfids$y" ||
+           Name == ".gehcont$y";
+  };
+
+  DenseMap<size_t, size_t> SymIdMap;
+  SmallDenseMap<ssize_t, coff_aux_section_definition *, 4> SecIdMap;
+  for (Symbol &Sym : Obj.getMutableSymbols()) {
+    SymIdMap[Sym.OriginalRawIndex] = Sym.RawIndex;
+
+    // Collect the section definition symbols whose checksums must be updated.
+    if (Sym.Sym.StorageClass == IMAGE_SYM_CLASS_STATIC &&
+        Sym.Sym.NumberOfAuxSymbols == 1 && Sym.Sym.Value == 0 &&
+        IsSymIdxSection(Sym.Name))
+      SecIdMap[Sym.TargetSectionId] =
+          reinterpret_cast<coff_aux_section_definition *>(
+              Sym.AuxData[0].Opaque);
+  }
+
+  for (Section &Sec : Obj.getMutableSections()) {
+    if (!IsSymIdxSection(Sec.Name))
+      continue;
+
+    ArrayRef<uint8_t> RawIds = Sec.getContents();
+    // An empty section needs no rewrite, and its CRC would be -1 instead of 0.
+    if (RawIds.empty())
+      continue;
+    // Each entry is a 32-bit index, so a trailing partial entry is malformed.
+    if (RawIds.size() % sizeof(uint32_t) != 0)
+      return createStringError(object_error::parse_failed,
+                               "section '%s' size is not a multiple of 4",
+                               Sec.Name.str().c_str());
+
+    auto SecDefIt = SecIdMap.find(Sec.UniqueId);
+    if (SecDefIt == SecIdMap.end())
+      return createStringError(object_error::invalid_symbol_index,
+                               "section '%s' does not have the corresponding "
+                               "symbol or the symbol has unexpected format",
+                               Sec.Name.str().c_str());
+
+    // Create updated content.
+    std::vector<uint8_t> NewRawIds(RawIds.size());
+    for (size_t Off = 0; Off < RawIds.size(); Off += sizeof(uint32_t)) {
+      uint32_t Id = support::endian::read32le(RawIds.data() + Off);
+      auto SymIdIt = SymIdMap.find(Id);
+      if (SymIdIt == SymIdMap.end())
+        return createStringError(object_error::invalid_symbol_index,
+                                 "section '%s' contains a .symidx (%u) that is "
+                                 "incorrect or was stripped",
+                                 Sec.Name.str().c_str(), Id);
+      support::endian::write32le(NewRawIds.data() + Off, SymIdIt->getSecond());
+    }
+    // Update the checksum, then hand the new content over to the section.
+    JamCRC JC(/*Init=*/0);
+    JC.update(NewRawIds);
+    SecDefIt->getSecond()->CheckSum = JC.getCRC();
+    Sec.setOwnedContents(std::move(NewRawIds));
+  }
+  return Error::success();
+}
+
 void COFFWriter::layoutSections() {
   for (auto &S : Obj.getMutableSections()) {
     if (S.Header.SizeOfRawData > 0)
@@ -183,6 +256,8 @@ Error COFFWriter::finalize(bool IsBigObj) {
     return E;
   if (Error E = finalizeSymbolContents())
     return E;
+  if (Error E = finalizeSymIdxContents())
+    return E;
 
   size_t SizeOfHeaders = 0;
   FileAlignment = 1;
diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h
index b7dca69e9a81..66d7f01c87f1 100644
--- a/llvm/lib/ObjCopy/COFF/COFFWriter.h
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h
@@ -34,6 +34,7 @@ class COFFWriter {
   template <class SymbolTy> std::pair<size_t, size_t> finalizeSymbolTable();
   Error finalizeRelocTargets();
   Error finalizeSymbolContents();
+  Error finalizeSymIdxContents();
   void layoutSections();
   Expected<size_t> finalizeStringTable();
 
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index 2b17d728aad3..eef8a2190c4d 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -13,6 +13,13 @@
 using namespace llvm;
 using namespace llvm::objcopy;
 
+Expected<const ELFConfig &> ConfigManager::getELFConfig() const {
+  if (!Common.ExtractSection.empty())
+    return createStringError(llvm::errc::invalid_argument,
+                             "option is not supported for ELF");
+  return ELF;
+}
+
 Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
   if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
       !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
@@ -27,7 +34,7 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
       Common.DiscardMode == DiscardType::Locals ||
       !Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
       Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
-      !Common.ChangeSectionAddress.empty())
+      !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "option is not supported for COFF");
 
@@ -48,7 +55,7 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
       Common.DiscardMode == DiscardType::Locals ||
       !Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
       Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
-      !Common.ChangeSectionAddress.empty())
+      !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "option is not supported for MachO");
 
@@ -69,7 +76,7 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
       !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() ||
       !Common.SymbolsToRename.empty() || Common.GapFill != 0 ||
       Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
-      !Common.ChangeSectionAddress.empty())
+      !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty())
     return createStringError(llvm::errc::invalid_argument,
                              "only flags for section dumping, removal, and "
                              "addition are supported");
@@ -99,7 +106,7 @@ Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const {
       Common.Weaken || Common.StripUnneeded || Common.DecompressDebugSections ||
       Common.GapFill != 0 || Common.PadTo != 0 ||
       Common.ChangeSectionLMAValAll != 0 ||
-      !Common.ChangeSectionAddress.empty()) {
+      !Common.ChangeSectionAddress.empty() || !Common.ExtractSection.empty()) {
     return createStringError(
         llvm::errc::invalid_argument,
         "no flags are supported yet, only basic copying is allowed");
@@ -116,18 +123,16 @@ ConfigManager::getDXContainerConfig() const {
       !Common.AllocSectionsPrefix.empty() ||
       Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() ||
       !Common.DumpSection.empty() || !Common.KeepSection.empty() ||
-      !Common.OnlySection.empty() || !Common.SectionsToRename.empty() ||
-      !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() ||
-      !Common.SetSectionType.empty() || Common.ExtractDWO ||
-      Common.OnlyKeepDebug || Common.StripAllGNU || Common.StripDWO ||
-      Common.StripDebug || Common.StripNonAlloc || Common.StripSections ||
-      Common.StripUnneeded || Common.DecompressDebugSections ||
-      Common.GapFill != 0 || Common.PadTo != 0 ||
-      Common.ChangeSectionLMAValAll != 0 ||
+      !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
+      !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() ||
+      Common.ExtractDWO || Common.OnlyKeepDebug || Common.StripAllGNU ||
+      Common.StripDWO || Common.StripDebug || Common.StripNonAlloc ||
+      Common.StripSections || Common.StripUnneeded ||
+      Common.DecompressDebugSections || Common.GapFill != 0 ||
+      Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
       !Common.ChangeSectionAddress.empty()) {
-    return createStringError(
-        llvm::errc::invalid_argument,
-        "no flags are supported yet, only basic copying is allowed");
+    return createStringError(llvm::errc::invalid_argument,
+                             "option is not supported for DXContainer");
   }
   return DXContainer;
 }
diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
index 375e382ddb04..d7f3c0d1f7b3 100644
--- a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
+++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
@@ -11,6 +11,7 @@
 #include "DXContainerWriter.h"
 #include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/ObjCopy/DXContainer/DXContainerConfig.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace objcopy {
@@ -18,7 +19,40 @@ namespace dxbc {
 
 using namespace object;
 
+static Error extractPartAsObject(StringRef PartName, StringRef OutFilename,
+                                 StringRef InputFilename, const Object &Obj) {
+  for (const Part &P : Obj.Parts)
+    if (P.Name == PartName) {
+      Object PartObj;
+      PartObj.Header = Obj.Header;
+      PartObj.Parts.push_back({P.Name, P.Data});
+      PartObj.recomputeHeader();
+
+      auto Write = [&OutFilename, &PartObj](raw_ostream &Out) -> Error {
+        DXContainerWriter Writer(PartObj, Out);
+        if (Error E = Writer.write())
+          return createFileError(OutFilename, std::move(E));
+        return Error::success();
+      };
+
+      return writeToOutput(OutFilename, Write);
+    }
+
+  return createFileError(InputFilename, object_error::parse_failed,
+                         "part '%s' not found", PartName.str().c_str());
+}
+
 static Error handleArgs(const CommonConfig &Config, Object &Obj) {
+  // Extract all sections before any modifications.
+  for (StringRef Flag : Config.ExtractSection) {
+    StringRef SectionName;
+    StringRef FileName;
+    std::tie(SectionName, FileName) = Flag.split('=');
+    if (Error E = extractPartAsObject(SectionName, FileName,
+                                      Config.InputFilename, Obj))
+      return E;
+  }
+
   std::function<bool(const Part &)> RemovePred = [](const Part &) {
     return false;
   };
@@ -28,6 +62,13 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
       return Config.ToRemove.matches(P.Name);
     };
 
+  if (!Config.OnlySection.empty())
+    RemovePred = [&Config](const Part &P) {
+      // Explicitly keep these sections regardless of previous removes and
+      // remove everything else.
+      return !Config.OnlySection.matches(P.Name);
+    };
+
   if (auto E = Obj.removeParts(RemovePred))
     return E;
 
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index e5de17e093df..78b674c5fa34 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -1307,6 +1307,9 @@ Error BasicELFBuilder::initSections() {
   return Error::success();
 }
 
+BasicELFBuilder::BasicELFBuilder() : Obj(std::make_unique<Object>()) {}
+BasicELFBuilder::~BasicELFBuilder() = default;
+
 void BinaryELFBuilder::addData(SymbolTableSection *SymTab) {
   auto Data = ArrayRef<uint8_t>(
       reinterpret_cast<const uint8_t *>(MemBuf->getBufferStart()),
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index d8f79a4b1a3c..7ec0e9be3dda 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -1059,7 +1059,8 @@ protected:
   Error initSections();
 
 public:
-  BasicELFBuilder() : Obj(std::make_unique<Object>()) {}
+  BasicELFBuilder();
+  ~BasicELFBuilder();
 };
 
 class BinaryELFBuilder : public BasicELFBuilder {
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index 788c6020a7f9..53699ce0d4fc 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -847,7 +847,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
     if (!FeatEnableOrErr)
       return FeatEnableOrErr.takeError();
     FeatEnable = *FeatEnableOrErr;
-    if (FeatEnable.CallsiteOffsets && Version < 3)
+    if (FeatEnable.CallsiteEndOffsets && Version < 3)
       return createError("version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when "
                          "callsite offsets feature is enabled: version = " +
                          Twine(static_cast<int>(Version)) +
@@ -890,22 +890,22 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
           uint32_t ID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
           uint32_t Offset = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
           // Read the callsite offsets.
-          uint32_t LastCallsiteOffset = 0;
-          SmallVector<uint32_t, 1> CallsiteOffsets;
-          if (FeatEnable.CallsiteOffsets) {
+          uint32_t LastCallsiteEndOffset = 0;
+          SmallVector<uint32_t, 1> CallsiteEndOffsets;
+          if (FeatEnable.CallsiteEndOffsets) {
             uint32_t NumCallsites =
                 readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
-            CallsiteOffsets.reserve(NumCallsites);
+            CallsiteEndOffsets.reserve(NumCallsites);
             for (uint32_t CallsiteIndex = 0;
                  !ULEBSizeErr && Cur && (CallsiteIndex < NumCallsites);
                  ++CallsiteIndex) {
-              LastCallsiteOffset +=
+              LastCallsiteEndOffset +=
                   readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
-              CallsiteOffsets.push_back(LastCallsiteOffset);
+              CallsiteEndOffsets.push_back(LastCallsiteEndOffset);
             }
           }
           uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) +
-                          LastCallsiteOffset;
+                          LastCallsiteEndOffset;
           uint32_t MD = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
           Expected<BBAddrMap::BBEntry::Metadata> MetadataOrErr =
               BBAddrMap::BBEntry::Metadata::decode(MD);
@@ -914,7 +914,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
             break;
           }
           BBEntries.push_back({ID, Offset + PrevBBEndOffset, Size,
-                               *MetadataOrErr, CallsiteOffsets});
+                               *MetadataOrErr, CallsiteEndOffsets});
           PrevBBEndOffset += Offset + Size;
         }
         TotalNumBlocks += BBEntries.size();
diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp
index 170677291277..9442becdb7d3 100644
--- a/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -81,17 +81,16 @@ initializeRecordStreamer(const Module &M,
   const Target *T = TargetRegistry::lookupTarget(TT, Err);
   assert(T && T->hasMCAsmParser());
 
-  std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TT.str()));
+  std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TT));
   if (!MRI)
     return;
 
   MCTargetOptions MCOptions;
-  std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, TT.str(), MCOptions));
+  std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, TT, MCOptions));
   if (!MAI)
     return;
 
-  std::unique_ptr<MCSubtargetInfo> STI(
-      T->createMCSubtargetInfo(TT.str(), "", ""));
+  std::unique_ptr<MCSubtargetInfo> STI(T->createMCSubtargetInfo(TT, "", ""));
   if (!STI)
     return;
 
diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
index 043b575a43b1..1078b1188bb6 100644
--- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp
@@ -38,7 +38,7 @@ private:
   Error validateSize(uint32_t Computed);
 
   void writeHeader(raw_ostream &OS);
-  void writeParts(raw_ostream &OS);
+  Error writeParts(raw_ostream &OS);
 };
 } // namespace
 
@@ -107,7 +107,7 @@ void DXContainerWriter::writeHeader(raw_ostream &OS) {
            Offsets.size() * sizeof(uint32_t));
 }
 
-void DXContainerWriter::writeParts(raw_ostream &OS) {
+Error DXContainerWriter::writeParts(raw_ostream &OS) {
   uint32_t RollingOffset =
       sizeof(dxbc::Header) + (ObjectFile.Header.PartCount * sizeof(uint32_t));
   for (auto I : llvm::zip(ObjectFile.Parts, *ObjectFile.Header.PartOffsets)) {
@@ -269,65 +269,68 @@ void DXContainerWriter::writeParts(raw_ostream &OS) {
       mcdxbc::RootSignatureDesc RS;
       RS.Flags = P.RootSignature->getEncodedFlags();
       RS.Version = P.RootSignature->Version;
-      RS.RootParameterOffset = P.RootSignature->RootParametersOffset;
       RS.NumStaticSamplers = P.RootSignature->NumStaticSamplers;
-      RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset;
 
       for (DXContainerYAML::RootParameterLocationYaml &L :
            P.RootSignature->Parameters.Locations) {
-        dxbc::RTS0::v1::RootParameterHeader Header{L.Header.Type, L.Header.Visibility,
-                                         L.Header.Offset};
 
-        switch (L.Header.Type) {
-        case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): {
+        assert(dxbc::isValidParameterType(L.Header.Type) &&
+               "invalid DXContainer YAML");
+        assert(dxbc::isValidShaderVisibility(L.Header.Visibility) &&
+               "invalid DXContainer YAML");
+        dxbc::RootParameterType Type = dxbc::RootParameterType(L.Header.Type);
+        dxbc::ShaderVisibility Visibility =
+            dxbc::ShaderVisibility(L.Header.Visibility);
+
+        switch (Type) {
+        case dxbc::RootParameterType::Constants32Bit: {
           const DXContainerYAML::RootConstantsYaml &ConstantYaml =
               P.RootSignature->Parameters.getOrInsertConstants(L);
-          dxbc::RTS0::v1::RootConstants Constants;
+          mcdxbc::RootConstants Constants;
+
           Constants.Num32BitValues = ConstantYaml.Num32BitValues;
           Constants.RegisterSpace = ConstantYaml.RegisterSpace;
           Constants.ShaderRegister = ConstantYaml.ShaderRegister;
-          RS.ParametersContainer.addParameter(Header, Constants);
+          RS.ParametersContainer.addParameter(Type, Visibility, Constants);
           break;
         }
-        case llvm::to_underlying(dxbc::RootParameterType::CBV):
-        case llvm::to_underlying(dxbc::RootParameterType::SRV):
-        case llvm::to_underlying(dxbc::RootParameterType::UAV): {
+        case dxbc::RootParameterType::CBV:
+        case dxbc::RootParameterType::SRV:
+        case dxbc::RootParameterType::UAV: {
           const DXContainerYAML::RootDescriptorYaml &DescriptorYaml =
               P.RootSignature->Parameters.getOrInsertDescriptor(L);
 
-          dxbc::RTS0::v2::RootDescriptor Descriptor;
+          mcdxbc::RootDescriptor Descriptor;
           Descriptor.RegisterSpace = DescriptorYaml.RegisterSpace;
           Descriptor.ShaderRegister = DescriptorYaml.ShaderRegister;
           if (RS.Version > 1)
             Descriptor.Flags = DescriptorYaml.getEncodedFlags();
-          RS.ParametersContainer.addParameter(Header, Descriptor);
+          RS.ParametersContainer.addParameter(Type, Visibility, Descriptor);
           break;
         }
-        case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+        case dxbc::RootParameterType::DescriptorTable: {
           const DXContainerYAML::DescriptorTableYaml &TableYaml =
               P.RootSignature->Parameters.getOrInsertTable(L);
           mcdxbc::DescriptorTable Table;
           for (const auto &R : TableYaml.Ranges) {
-
-            dxbc::RTS0::v2::DescriptorRange Range;
-            Range.RangeType = R.RangeType;
+            assert(dxbc::isValidRangeType(R.RangeType) &&
+                   "Invalid Descriptor Range Type");
+            mcdxbc::DescriptorRange Range;
+            Range.RangeType = dxil::ResourceClass(R.RangeType);
             Range.NumDescriptors = R.NumDescriptors;
             Range.BaseShaderRegister = R.BaseShaderRegister;
             Range.RegisterSpace = R.RegisterSpace;
             Range.OffsetInDescriptorsFromTableStart =
                 R.OffsetInDescriptorsFromTableStart;
+
             if (RS.Version > 1)
               Range.Flags = R.getEncodedFlags();
+
             Table.Ranges.push_back(Range);
           }
-          RS.ParametersContainer.addParameter(Header, Table);
+          RS.ParametersContainer.addParameter(Type, Visibility, Table);
           break;
         }
-        default:
-          // Handling invalid parameter type edge case. We intentionally let
-          // obj2yaml/yaml2obj parse and emit invalid dxcontainer data, in order
-          // for that to be used as a testing tool more effectively.
-          RS.ParametersContainer.addInvalidParameter(Header);
         }
       }
 
@@ -350,6 +353,27 @@ void DXContainerWriter::writeParts(raw_ostream &OS) {
         RS.StaticSamplers.push_back(NewSampler);
       }
 
+      // Handling of offsets
+      RS.RootParameterOffset = RS.computeRootParametersOffset();
+      if (P.RootSignature->RootParametersOffset &&
+          P.RootSignature->RootParametersOffset.value() !=
+              RS.RootParameterOffset) {
+        return createStringError(
+            errc::invalid_argument,
+            "Specified RootParametersOffset does not match required value: %d.",
+            RS.RootParameterOffset);
+      }
+
+      RS.StaticSamplersOffset = RS.computeStaticSamplersOffset();
+      if (P.RootSignature->StaticSamplersOffset &&
+          P.RootSignature->StaticSamplersOffset.value() !=
+              RS.StaticSamplersOffset) {
+        return createStringError(
+            errc::invalid_argument,
+            "Specified StaticSamplersOffset does not match computed value: %d.",
+            RS.StaticSamplersOffset);
+      }
+
       RS.write(OS);
       break;
     }
@@ -359,14 +383,15 @@ void DXContainerWriter::writeParts(raw_ostream &OS) {
       OS.write_zeros(PartSize - BytesWritten);
     RollingOffset += PartSize;
   }
+
+  return Error::success();
 }
 
 Error DXContainerWriter::write(raw_ostream &OS) {
   if (Error Err = computePartOffsets())
     return Err;
   writeHeader(OS);
-  writeParts(OS);
-  return Error::success();
+  return writeParts(OS);
 }
 
 namespace llvm {
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 263f7bdf37bc..32b502ed4e21 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -376,9 +376,9 @@ void MappingTraits<DXContainerYAML::RootSignatureYamlDesc>::mapping(
     IO &IO, DXContainerYAML::RootSignatureYamlDesc &S) {
   IO.mapRequired("Version", S.Version);
   IO.mapRequired("NumRootParameters", S.NumRootParameters);
-  IO.mapRequired("RootParametersOffset", S.RootParametersOffset);
+  IO.mapOptional("RootParametersOffset", S.RootParametersOffset, std::nullopt);
   IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers);
-  IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset);
+  IO.mapOptional("StaticSamplersOffset", S.StaticSamplersOffset, std::nullopt);
   IO.mapRequired("Parameters", S.Parameters.Locations, S);
   IO.mapOptional("Samplers", S.StaticSamplers);
 #define ROOT_SIGNATURE_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false);
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index bc5c68d08d11..2386a2e3e6c4 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1487,8 +1487,8 @@ void ELFState<ELFT>::writeSectionContent(
     if (!E.BBRanges)
       continue;
     uint64_t TotalNumBlocks = 0;
-    bool EmitCallsiteOffsets =
-        FeatureOrErr->CallsiteOffsets || E.hasAnyCallsiteOffsets();
+    bool EmitCallsiteEndOffsets =
+        FeatureOrErr->CallsiteEndOffsets || E.hasAnyCallsiteEndOffsets();
     for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *E.BBRanges) {
       // Write the base address of the range.
       CBA.write<uintX_t>(BBR.BaseAddress, ELFT::Endianness);
@@ -1506,12 +1506,12 @@ void ELFState<ELFT>::writeSectionContent(
         if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP && E.Version > 1)
           SHeader.sh_size += CBA.writeULEB128(BBE.ID);
         SHeader.sh_size += CBA.writeULEB128(BBE.AddressOffset);
-        if (EmitCallsiteOffsets) {
-          size_t NumCallsiteOffsets =
-              BBE.CallsiteOffsets ? BBE.CallsiteOffsets->size() : 0;
-          SHeader.sh_size += CBA.writeULEB128(NumCallsiteOffsets);
-          if (BBE.CallsiteOffsets) {
-            for (uint32_t Offset : *BBE.CallsiteOffsets)
+        if (EmitCallsiteEndOffsets) {
+          size_t NumCallsiteEndOffsets =
+              BBE.CallsiteEndOffsets ? BBE.CallsiteEndOffsets->size() : 0;
+          SHeader.sh_size += CBA.writeULEB128(NumCallsiteEndOffsets);
+          if (BBE.CallsiteEndOffsets) {
+            for (uint32_t Offset : *BBE.CallsiteEndOffsets)
               SHeader.sh_size += CBA.writeULEB128(Offset);
           }
         }
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index c27339de67ef..62d80a24f478 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -1884,7 +1884,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry::BBEntry>::mapping(
   IO.mapRequired("AddressOffset", E.AddressOffset);
   IO.mapRequired("Size", E.Size);
   IO.mapRequired("Metadata", E.Metadata);
-  IO.mapOptional("CallsiteOffsets", E.CallsiteOffsets);
+  IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets);
 }
 
 void MappingTraits<ELFYAML::PGOAnalysisMapEntry>::mapping(
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index b7edeea08276..8cf277657a54 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -69,6 +69,7 @@
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionDivision.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/StackLifetime.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
@@ -184,6 +185,7 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/IRPrinter/IRPrintingPasses.h"
 #include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -1491,6 +1493,27 @@ parseBoundsCheckingOptions(StringRef Params) {
   return Options;
 }
 
+Expected<CodeGenOptLevel> parseExpandFpOptions(StringRef Param) {
+  if (Param.empty())
+    return CodeGenOptLevel::None;
+
+  // Parse a CodeGenOptLevel, e.g. "O1", "O2", "O3".
+  auto [Prefix, Digit] = Param.split('O');
+
+  uint8_t N;
+  if (!Prefix.empty() || Digit.getAsInteger(10, N))
+    return createStringError("invalid expand-fp pass parameter '%s'",
+                             Param.str().c_str());
+
+  std::optional<CodeGenOptLevel> Level = CodeGenOpt::getLevel(N);
+  if (!Level.has_value())
+    return createStringError(
+        "invalid optimization level for expand-fp pass: %s",
+        Digit.str().c_str());
+
+  return *Level;
+}
+
 Expected<RAGreedyPass::Options>
 parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) {
   if (Params.empty() || Params == "all")
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b111dc20d35..1d015971dfbd 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -426,7 +426,6 @@ FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter())
 FUNCTION_PASS("dse", DSEPass())
 FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM))
-FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("expand-reductions", ExpandReductionsPass())
 FUNCTION_PASS("extra-vector-passes",
@@ -518,6 +517,7 @@ FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(errs()))
 FUNCTION_PASS("print<postdomtree>", PostDominatorTreePrinterPass(errs()))
 FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(errs()))
 FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(errs()))
+FUNCTION_PASS("print<scev-division>", SCEVDivisionPrinterPass(errs()))
 FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs()))
 FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs()))
 FUNCTION_PASS("prof-inject", ProfileInjectorPass())
@@ -719,6 +719,13 @@ FUNCTION_PASS_WITH_PARAMS(
     },
     parseBoundsCheckingOptions,
     "trap;rt;rt-abort;min-rt;min-rt-abort;merge;guard=N")
+FUNCTION_PASS_WITH_PARAMS(
+    "expand-fp", "ExpandFpPass",
+    [TM = TM](CodeGenOptLevel OL) {
+      return ExpandFpPass(TM, OL);
+    },
+    parseExpandFpOptions, "O0;O1;O2;O3")
+
 #undef FUNCTION_PASS_WITH_PARAMS
 
 #ifndef LOOPNEST_PASS
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index f165e85baf61..de293308ae69 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -118,15 +118,15 @@ static cl::opt<bool> PrintPassNumbers(
     "print-pass-numbers", cl::init(false), cl::Hidden,
     cl::desc("Print pass names and their ordinals"));
 
-static cl::opt<unsigned> PrintBeforePassNumber(
-    "print-before-pass-number", cl::init(0), cl::Hidden,
-    cl::desc("Print IR before the pass with this number as "
+static cl::list<unsigned> PrintBeforePassNumber(
+    "print-before-pass-number", cl::CommaSeparated, cl::Hidden,
+    cl::desc("Print IR before the passes with specified numbers as "
              "reported by print-pass-numbers"));
 
-static cl::opt<unsigned>
-    PrintAfterPassNumber("print-after-pass-number", cl::init(0), cl::Hidden,
-                         cl::desc("Print IR after the pass with this number as "
-                                  "reported by print-pass-numbers"));
+static cl::list<unsigned> PrintAfterPassNumber(
+    "print-after-pass-number", cl::CommaSeparated, cl::Hidden,
+    cl::desc("Print IR after the passes with specified numbers as "
+             "reported by print-pass-numbers"));
 
 static cl::opt<std::string> IRDumpDirectory(
     "ir-dump-directory",
@@ -984,12 +984,12 @@ bool PrintIRInstrumentation::shouldPrintAfterPass(StringRef PassID) {
 
 bool PrintIRInstrumentation::shouldPrintBeforeCurrentPassNumber() {
   return shouldPrintBeforeSomePassNumber() &&
-         (CurrentPassNumber == PrintBeforePassNumber);
+         (is_contained(PrintBeforePassNumber, CurrentPassNumber));
 }
 
 bool PrintIRInstrumentation::shouldPrintAfterCurrentPassNumber() {
   return shouldPrintAfterSomePassNumber() &&
-         (CurrentPassNumber == PrintAfterPassNumber);
+         (is_contained(PrintAfterPassNumber, CurrentPassNumber));
 }
 
 bool PrintIRInstrumentation::shouldPrintPassNumbers() {
@@ -997,11 +997,11 @@ bool PrintIRInstrumentation::shouldPrintPassNumbers() {
 }
 
 bool PrintIRInstrumentation::shouldPrintBeforeSomePassNumber() {
-  return PrintBeforePassNumber > 0;
+  return !PrintBeforePassNumber.empty();
 }
 
 bool PrintIRInstrumentation::shouldPrintAfterSomePassNumber() {
-  return PrintAfterPassNumber > 0;
+  return !PrintAfterPassNumber.empty();
 }
 
 void PrintIRInstrumentation::registerCallbacks(
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
index 312886013598..86a6c6dffb18 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Remarks/BitstreamRemarkParser.h"
 #include "BitstreamRemarkParser.h"
 #include "llvm/Remarks/Remark.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -600,3 +599,5 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) {
 
   return std::move(Result);
 }
+llvm::remarks::BitstreamRemarkParser::BitstreamRemarkParser(StringRef Buf)
+    : RemarkParser(Format::Bitstream), ParserHelper(Buf) {}
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h
index f6f79ef199f7..cba805dc24b5 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.h
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.h
@@ -13,10 +13,14 @@
 #ifndef LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H
 #define LLVM_LIB_REMARKS_BITSTREAM_REMARK_PARSER_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Bitstream/BitstreamReader.h"
 #include "llvm/Remarks/BitstreamRemarkContainer.h"
-#include "llvm/Remarks/BitstreamRemarkParser.h"
 #include "llvm/Remarks/RemarkFormat.h"
 #include "llvm/Remarks/RemarkParser.h"
+#include "llvm/Support/Error.h"
+#include <array>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -26,6 +30,91 @@ namespace remarks {
 
 struct Remark;
 
+/// Helper to parse a META_BLOCK for a bitstream remark container.
+struct BitstreamMetaParserHelper {
+  /// The Bitstream reader.
+  BitstreamCursor &Stream;
+  /// Reference to the storage for the block info.
+  BitstreamBlockInfo &BlockInfo;
+  /// The parsed content: depending on the container type, some fields might be
+  /// empty.
+  std::optional<uint64_t> ContainerVersion;
+  std::optional<uint8_t> ContainerType;
+  std::optional<StringRef> StrTabBuf;
+  std::optional<StringRef> ExternalFilePath;
+  std::optional<uint64_t> RemarkVersion;
+
+  /// Continue parsing with \p Stream. \p Stream is expected to contain an
+  /// ENTER_SUBBLOCK to the META_BLOCK at the current position.
+  /// \p Stream is expected to have a BLOCKINFO_BLOCK set.
+  BitstreamMetaParserHelper(BitstreamCursor &Stream,
+                            BitstreamBlockInfo &BlockInfo);
+
+  /// Parse the META_BLOCK and fill the available entries.
+  /// This helper does not check for the validity of the fields.
+  Error parse();
+};
+
+/// Helper to parse a REMARK_BLOCK for a bitstream remark container.
+struct BitstreamRemarkParserHelper {
+  /// The Bitstream reader.
+  BitstreamCursor &Stream;
+  /// The parsed content: depending on the remark, some fields might be empty.
+  std::optional<uint8_t> Type;
+  std::optional<uint64_t> RemarkNameIdx;
+  std::optional<uint64_t> PassNameIdx;
+  std::optional<uint64_t> FunctionNameIdx;
+  std::optional<uint64_t> SourceFileNameIdx;
+  std::optional<uint32_t> SourceLine;
+  std::optional<uint32_t> SourceColumn;
+  std::optional<uint64_t> Hotness;
+  struct Argument {
+    std::optional<uint64_t> KeyIdx;
+    std::optional<uint64_t> ValueIdx;
+    std::optional<uint64_t> SourceFileNameIdx;
+    std::optional<uint32_t> SourceLine;
+    std::optional<uint32_t> SourceColumn;
+  };
+  std::optional<ArrayRef<Argument>> Args;
+  /// Avoid re-allocating a vector every time.
+  SmallVector<Argument, 8> TmpArgs;
+
+  /// Continue parsing with \p Stream. \p Stream is expected to contain an
+  /// ENTER_SUBBLOCK to the REMARK_BLOCK at the current position.
+  /// \p Stream is expected to have a BLOCKINFO_BLOCK set and to have already
+  /// parsed the META_BLOCK.
+  BitstreamRemarkParserHelper(BitstreamCursor &Stream);
+
+  /// Parse the REMARK_BLOCK and fill the available entries.
+  /// This helper does not check for the validity of the fields.
+  Error parse();
+};
+
+/// Helper to parse any bitstream remark container.
+struct BitstreamParserHelper {
+  /// The Bitstream reader.
+  BitstreamCursor Stream;
+  /// The block info block.
+  BitstreamBlockInfo BlockInfo;
+  /// Start parsing at \p Buffer.
+  BitstreamParserHelper(StringRef Buffer);
+  /// Parse the magic number.
+  Expected<std::array<char, 4>> parseMagic();
+  /// Parse the block info block containing all the abbrevs.
+  /// This needs to be called before calling any other parsing function.
+  Error parseBlockInfoBlock();
+  /// Return true if the next block is a META_BLOCK. This function does not move
+  /// the cursor.
+  Expected<bool> isMetaBlock();
+  /// Return true if the next block is a REMARK_BLOCK. This function does not
+  /// move the cursor.
+  Expected<bool> isRemarkBlock();
+  /// Return true if the parser reached the end of the stream.
+  bool atEndOfStream() { return Stream.AtEndOfStream(); }
+  /// Jump to the end of the stream, skipping everything.
+  void skipToEnd() { return Stream.skipToEnd(); }
+};
+
 /// Parses and holds the state of the latest parsed remark.
 struct BitstreamRemarkParser : public RemarkParser {
   /// The buffer to parse.
@@ -45,8 +134,7 @@ struct BitstreamRemarkParser : public RemarkParser {
 
   /// Create a parser that expects to find a string table embedded in the
   /// stream.
-  explicit BitstreamRemarkParser(StringRef Buf)
-      : RemarkParser(Format::Bitstream), ParserHelper(Buf) {}
+  explicit BitstreamRemarkParser(StringRef Buf);
 
   Expected<std::unique_ptr<Remark>> next() override;
 
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index aa5b3c78ea5f..d14abb4bd05b 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -2927,51 +2927,6 @@ APFloat::opStatus IEEEFloat::convertFromAPInt(const APInt &Val, bool isSigned,
   return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
 }
 
-/* Convert a two's complement integer SRC to a floating point number,
-   rounding according to ROUNDING_MODE.  ISSIGNED is true if the
-   integer is signed, in which case it must be sign-extended.  */
-APFloat::opStatus
-IEEEFloat::convertFromSignExtendedInteger(const integerPart *src,
-                                          unsigned int srcCount, bool isSigned,
-                                          roundingMode rounding_mode) {
-  opStatus status;
-
-  if (isSigned &&
-      APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) {
-    integerPart *copy;
-
-    /* If we're signed and negative negate a copy.  */
-    sign = true;
-    copy = new integerPart[srcCount];
-    APInt::tcAssign(copy, src, srcCount);
-    APInt::tcNegate(copy, srcCount);
-    status = convertFromUnsignedParts(copy, srcCount, rounding_mode);
-    delete [] copy;
-  } else {
-    sign = false;
-    status = convertFromUnsignedParts(src, srcCount, rounding_mode);
-  }
-
-  return status;
-}
-
-/* FIXME: should this just take a const APInt reference?  */
-APFloat::opStatus
-IEEEFloat::convertFromZeroExtendedInteger(const integerPart *parts,
-                                          unsigned int width, bool isSigned,
-                                          roundingMode rounding_mode) {
-  unsigned int partCount = partCountForBits(width);
-  APInt api = APInt(width, ArrayRef(parts, partCount));
-
-  sign = false;
-  if (isSigned && APInt::tcExtractBit(parts, width - 1)) {
-    sign = true;
-    api = -api;
-  }
-
-  return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
-}
-
 Expected<APFloat::opStatus>
 IEEEFloat::convertFromHexadecimalString(StringRef s,
                                         roundingMode rounding_mode) {
@@ -5648,36 +5603,158 @@ DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input,
   return FS;
 }
 
-APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input,
-                                                  bool IsSigned,
-                                                  roundingMode RM) {
-  assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
-  APFloat Tmp(semPPCDoubleDoubleLegacy);
-  auto Ret = Tmp.convertFromAPInt(Input, IsSigned, RM);
-  *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
-  return Ret;
-}
+APFloat::opStatus DoubleAPFloat::handleOverflow(roundingMode RM) {
+  switch (RM) {
+  case APFloat::rmTowardZero:
+    makeLargest(/*Neg=*/isNegative());
+    break;
+  case APFloat::rmTowardNegative:
+    if (isNegative())
+      makeInf(/*Neg=*/true);
+    else
+      makeLargest(/*Neg=*/false);
+    break;
+  case APFloat::rmTowardPositive:
+    if (isNegative())
+      makeLargest(/*Neg=*/true);
+    else
+      makeInf(/*Neg=*/false);
+    break;
+  case APFloat::rmNearestTiesToAway:
+  case APFloat::rmNearestTiesToEven:
+    makeInf(/*Neg=*/isNegative());
+    break;
+  default:
+    llvm_unreachable("Invalid rounding mode found");
+  }
+  opStatus S = opInexact;
+  if (!getFirst().isFinite())
+    S = static_cast<opStatus>(S | opOverflow);
+  return S;
+}
+
+APFloat::opStatus DoubleAPFloat::convertFromUnsignedParts(
+    const integerPart *Src, unsigned int SrcCount, roundingMode RM) {
+  // Find the most significant bit of the source integer. APInt::tcMSB returns
+  // UINT_MAX for a zero value.
+  const unsigned SrcMSB = APInt::tcMSB(Src, SrcCount);
+  if (SrcMSB == UINT_MAX) {
+    // The source integer is 0.
+    makeZero(/*Neg=*/false);
+    return opOK;
+  }
 
-APFloat::opStatus
-DoubleAPFloat::convertFromSignExtendedInteger(const integerPart *Input,
-                                              unsigned int InputSize,
-                                              bool IsSigned, roundingMode RM) {
-  assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
-  APFloat Tmp(semPPCDoubleDoubleLegacy);
-  auto Ret = Tmp.convertFromSignExtendedInteger(Input, InputSize, IsSigned, RM);
-  *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
-  return Ret;
+  // Create a minimally-sized APInt to represent the source value.
+  const unsigned SrcBitWidth = SrcMSB + 1;
+  APSInt SrcInt{APInt{/*numBits=*/SrcBitWidth,
+                      /*numWords=*/SrcCount, Src},
+                /*isUnsigned=*/true};
+
+  // Stage 1: Initial Approximation.
+  // Convert the source integer SrcInt to the Hi part of the DoubleAPFloat.
+  // We use round-to-nearest because it minimizes the initial error, which is
+  // crucial for the subsequent steps.
+  APFloat Hi{getFirst().getSemantics()};
+  Hi.convertFromAPInt(SrcInt, /*IsSigned=*/false, rmNearestTiesToEven);
+
+  // If the first approximation already overflows, the number is too large.
+  // NOTE: The underlying semantics are *more* conservative when choosing to
+  // overflow because their notion of ULP is much larger. As such, it is always
+  // safe to overflow at the DoubleAPFloat level if the APFloat overflows.
+  if (!Hi.isFinite())
+    return handleOverflow(RM);
+
+  // Stage 2: Exact Error Calculation.
+  // Calculate the exact error of the first approximation: Error = SrcInt - Hi.
+  // This is done by converting Hi back to an integer and subtracting it from
+  // the original source.
+  bool HiAsIntIsExact;
+  // Create an integer representation of Hi. Its width is determined by the
+  // exponent of Hi, ensuring it is just large enough to hold Hi exactly when
+  // converted back to an integer. This width can exceed SrcBitWidth if the
+  // conversion to Hi rounded up to a power of two.
+  APSInt HiAsInt{static_cast<uint32_t>(ilogb(Hi) + 1), /*isUnsigned=*/true};
+  Hi.convertToInteger(HiAsInt, rmNearestTiesToEven, &HiAsIntIsExact);
+  const APInt Error = SrcInt.zext(HiAsInt.getBitWidth()) - HiAsInt;
+
+  // Stage 3: Error Approximation and Rounding.
+  // Convert the integer error into the Lo part of the DoubleAPFloat. This step
+  // captures the remainder of the original number. The rounding mode for this
+  // conversion (LoRM) may need to be adjusted from the user-requested RM to
+  // ensure the final sum (Hi + Lo) rounds correctly.
+  roundingMode LoRM = RM;
+  // Adjustments are only necessary when the initial approximation Hi was an
+  // overestimate, making the Error negative.
+  if (Error.isNegative()) {
+    if (RM == rmNearestTiesToAway) {
+      // For rmNearestTiesToAway, a tie should round away from zero. Since
+      // SrcInt is positive, this means rounding toward +infinity.
+      // A standard conversion of a negative Error would round ties toward
+      // -infinity, causing the final sum Hi + Lo to be smaller. To
+      // counteract this, we detect the tie case and override the rounding
+      // mode for Lo to rmTowardPositive.
+      const unsigned ErrorActiveBits = Error.getSignificantBits() - 1;
+      const unsigned LoPrecision = getSecond().getSemantics().precision;
+      if (ErrorActiveBits > LoPrecision) {
+        const unsigned RoundingBoundary = ErrorActiveBits - LoPrecision;
+        // A tie occurs when the bits to be truncated are of the form 100...0.
+        // This is detected by checking if the number of trailing zeros is
+        // exactly one less than the number of bits being truncated.
+        if (Error.countTrailingZeros() == RoundingBoundary - 1)
+          LoRM = rmTowardPositive;
+      }
+    } else if (RM == rmTowardZero) {
+      // For rmTowardZero, the final positive result must be truncated (rounded
+      // down). When Hi is an overestimate, Error is negative. A standard
+      // rmTowardZero conversion of Error would make it *less* negative,
+      // effectively rounding the final sum Hi + Lo *up*. To ensure the sum
+      // rounds down correctly, we force Lo to round toward -infinity.
+      LoRM = rmTowardNegative;
+    }
+  }
+
+  APFloat Lo{getSecond().getSemantics()};
+  opStatus Status = Lo.convertFromAPInt(Error, /*IsSigned=*/true, LoRM);
+
+  // Renormalize the pair (Hi, Lo) into a canonical DoubleAPFloat form where the
+  // components do not overlap. fastTwoSum performs this operation.
+  std::tie(Hi, Lo) = fastTwoSum(Hi, Lo);
+  Floats[0] = std::move(Hi);
+  Floats[1] = std::move(Lo);
+
+  // A final check for overflow is needed because fastTwoSum can cause a
+  // carry-out from Lo that pushes Hi to infinity.
+  if (!getFirst().isFinite())
+    return handleOverflow(RM);
+
+  // The largest DoubleAPFloat must be canonical. Values which are larger are
+  // not canonical and are equivalent to overflow.
+  if (getFirst().isFiniteNonZero() && Floats[0].isLargest()) {
+    DoubleAPFloat Largest{*Semantics};
+    Largest.makeLargest(/*Neg=*/false);
+    if (compare(Largest) == APFloat::cmpGreaterThan)
+      return handleOverflow(RM);
+  }
+
+  // The final status of the operation is determined by the conversion of the
+  // error term. If Lo could represent Error exactly, the entire conversion
+  // is exact. Otherwise, it's inexact.
+  return Status;
 }
 
-APFloat::opStatus
-DoubleAPFloat::convertFromZeroExtendedInteger(const integerPart *Input,
-                                              unsigned int InputSize,
-                                              bool IsSigned, roundingMode RM) {
-  assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
-  APFloat Tmp(semPPCDoubleDoubleLegacy);
-  auto Ret = Tmp.convertFromZeroExtendedInteger(Input, InputSize, IsSigned, RM);
-  *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
-  return Ret;
+APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input,
+                                                  bool IsSigned,
+                                                  roundingMode RM) {
+  const bool NegateInput = IsSigned && Input.isNegative();
+  APInt API = Input;
+  if (NegateInput)
+    API.negate();
+
+  const APFloat::opStatus Status =
+      convertFromUnsignedParts(API.getRawData(), API.getNumWords(), RM);
+  if (NegateInput)
+    changeSign();
+  return Status;
 }
 
 unsigned int DoubleAPFloat::convertToHexString(char *DST,
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 6646af6db5d3..2528e8bd1142 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -265,18 +265,23 @@ add_llvm_component_library(LLVMSupport
   ToolOutputFile.cpp
   TrieRawHashMap.cpp
   Twine.cpp
-  TypeSize.cpp
   Unicode.cpp
   UnicodeCaseFold.cpp
   UnicodeNameToCodepoint.cpp
   UnicodeNameToCodepointGenerated.cpp
   VersionTuple.cpp
   VirtualFileSystem.cpp
+  VirtualOutputBackend.cpp
+  VirtualOutputBackends.cpp
+  VirtualOutputConfig.cpp
+  VirtualOutputError.cpp
+  VirtualOutputFile.cpp
   WithColor.cpp
   YAMLParser.cpp
   YAMLTraits.cpp
   raw_os_ostream.cpp
   raw_ostream.cpp
+  raw_ostream_proxy.cpp
   raw_socket_stream.cpp
   regcomp.c
   regerror.c
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 8491633df97e..be232f5bff58 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -2671,7 +2671,6 @@ static void initCommonOptions() {
   initSignalsOptions();
   initStatisticOptions();
   initTimerOptions();
-  initTypeSizeOptions();
   initWithColorOptions();
   initDebugOptions();
   initRandomSeedOptions();
diff --git a/llvm/lib/Support/DebugOptions.h b/llvm/lib/Support/DebugOptions.h
index db727d5a584c..6c3382e8f858 100644
--- a/llvm/lib/Support/DebugOptions.h
+++ b/llvm/lib/Support/DebugOptions.h
@@ -24,7 +24,6 @@ void initGraphWriterOptions();
 void initSignalsOptions();
 void initStatisticOptions();
 void initTimerOptions();
-void initTypeSizeOptions();
 void initWithColorOptions();
 void initDebugOptions();
 void initRandomSeedOptions();
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp
index d8662340cb3e..4652c0740dc4 100644
--- a/llvm/lib/Support/JSON.cpp
+++ b/llvm/lib/Support/JSON.cpp
@@ -84,16 +84,7 @@ json::Array *Object::getArray(StringRef K) {
     return V->getAsArray();
   return nullptr;
 }
-bool operator==(const Object &LHS, const Object &RHS) {
-  if (LHS.size() != RHS.size())
-    return false;
-  for (const auto &L : LHS) {
-    auto R = RHS.find(L.first);
-    if (R == RHS.end() || L.second != R->second)
-      return false;
-  }
-  return true;
-}
+bool operator==(const Object &LHS, const Object &RHS) { return LHS.M == RHS.M; }
 
 Array::Array(std::initializer_list<Value> Elements) {
   V.reserve(Elements.size());
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index bd08365a3fcd..8d91f0e95d22 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -372,8 +372,7 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
-    Known.Zero.lshrInPlace(ShiftAmt);
-    Known.One.lshrInPlace(ShiftAmt);
+    Known >>= ShiftAmt;
     // High bits are known zero.
     Known.Zero.setHighBits(ShiftAmt);
     return Known;
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 2ba02b73dd8f..3ac6fc74fd3e 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -60,7 +60,7 @@ public:
     auto &Thread0 = Threads[0];
     Thread0 = std::thread([this, S] {
       for (unsigned I = 1; I < ThreadCount; ++I) {
-        Threads.emplace_back([=] { work(S, I); });
+        Threads.emplace_back([this, S, I] { work(S, I); });
         if (Stop)
           break;
       }
diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp
index 26e22161b605..82b0e6ac513e 100644
--- a/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/llvm/lib/Support/PrettyStackTrace.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
 
 static const char *BugReportMsg =
     "PLEASE submit a bug report to " BUG_REPORT_URL
-    " and include the crash backtrace.\n";
+    " and include the crash backtrace and instructions to reproduce the bug.\n";
 
 // If backtrace support is not enabled, compile out support for pretty stack
 // traces.  This has the secondary effect of not requiring thread local storage
diff --git a/llvm/lib/Support/Twine.cpp b/llvm/lib/Support/Twine.cpp
index 495b9cf2dbd6..d6b48166fb0f 100644
--- a/llvm/lib/Support/Twine.cpp
+++ b/llvm/lib/Support/Twine.cpp
@@ -56,11 +56,12 @@ StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const {
   return StringRef(Out.data(), Out.size());
 }
 
-void Twine::printOneChild(raw_ostream &OS, Child Ptr,
-                          NodeKind Kind) const {
+void Twine::printOneChild(raw_ostream &OS, Child Ptr, NodeKind Kind) const {
   switch (Kind) {
-  case Twine::NullKind: break;
-  case Twine::EmptyKind: break;
+  case Twine::NullKind:
+    break;
+  case Twine::EmptyKind:
+    break;
   case Twine::TwineKind:
     Ptr.twine->print(OS);
     break;
@@ -104,24 +105,23 @@ void Twine::printOneChild(raw_ostream &OS, Child Ptr,
   }
 }
 
-void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr,
-                              NodeKind Kind) const {
+void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr, NodeKind Kind) const {
   switch (Kind) {
   case Twine::NullKind:
-    OS << "null"; break;
+    OS << "null";
+    break;
   case Twine::EmptyKind:
-    OS << "empty"; break;
+    OS << "empty";
+    break;
   case Twine::TwineKind:
     OS << "rope:";
     Ptr.twine->printRepr(OS);
     break;
   case Twine::CStringKind:
-    OS << "cstring:\""
-       << Ptr.cString << "\"";
+    OS << "cstring:\"" << Ptr.cString << "\"";
     break;
   case Twine::StdStringKind:
-    OS << "std::string:\""
-       << Ptr.stdString << "\"";
+    OS << "std::string:\"" << Ptr.stdString << "\"";
     break;
   case Twine::PtrAndLengthKind:
     OS << "ptrAndLength:\""
@@ -175,11 +175,7 @@ void Twine::printRepr(raw_ostream &OS) const {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void Twine::dump() const {
-  print(dbgs());
-}
+LLVM_DUMP_METHOD void Twine::dump() const { print(dbgs()); }
 
-LLVM_DUMP_METHOD void Twine::dumpRepr() const {
-  printRepr(dbgs());
-}
+LLVM_DUMP_METHOD void Twine::dumpRepr() const { printRepr(dbgs()); }
 #endif
diff --git a/llvm/lib/Support/TypeSize.cpp b/llvm/lib/Support/TypeSize.cpp
deleted file mode 100644
index 43346b81cd67..000000000000
--- a/llvm/lib/Support/TypeSize.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- TypeSize.cpp - Wrapper around type sizes------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/TypeSize.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/WithColor.h"
-
-#include "DebugOptions.h"
-
-using namespace llvm;
-
-#ifndef STRICT_FIXED_SIZE_VECTORS
-namespace {
-struct CreateScalableErrorAsWarning {
-  /// The ScalableErrorAsWarning is a temporary measure to suppress errors from
-  /// using the wrong interface on a scalable vector.
-  static void *call() {
-    return new cl::opt<bool>(
-        "treat-scalable-fixed-error-as-warning", cl::Hidden,
-        cl::desc(
-            "Treat issues where a fixed-width property is requested from a "
-            "scalable type as a warning, instead of an error"));
-  }
-};
-} // namespace
-static ManagedStatic<cl::opt<bool>, CreateScalableErrorAsWarning>
-    ScalableErrorAsWarning;
-void llvm::initTypeSizeOptions() { *ScalableErrorAsWarning; }
-#else
-void llvm::initTypeSizeOptions() {}
-#endif
-
-void llvm::reportInvalidSizeRequest(const char *Msg) {
-#ifndef STRICT_FIXED_SIZE_VECTORS
-  if (*ScalableErrorAsWarning) {
-    WithColor::warning() << "Invalid size request on a scalable vector; " << Msg
-                         << "\n";
-    return;
-  }
-#endif
-  report_fatal_error("Invalid size request on a scalable vector.");
-}
-
-TypeSize::operator TypeSize::ScalarTy() const {
-  if (isScalable()) {
-    reportInvalidSizeRequest(
-        "Cannot implicitly convert a scalable size to a fixed-width size in "
-        "`TypeSize::operator ScalarTy()`");
-    return getKnownMinValue();
-  }
-  return getFixedValue();
-}
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 6cd38aabc734..573ad82f2dea 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -883,8 +883,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS, int Depth) {
     } else {
       const char *name = strrchr(dlinfo.dli_fname, '/');
       if (!name)
-        OS << format(" %-*s", width,
-                     static_cast<const char *>(dlinfo.dli_fname));
+        OS << format(" %-*s", width, dlinfo.dli_fname);
       else
         OS << format(" %-*s", width, name + 1);
     }
diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index 7854d6d22915..f016ed693752 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -194,9 +194,9 @@ void llvm::set_thread_name(const Twine &Name) {
   if (get_max_thread_name_length() > 0)
     NameStr = NameStr.take_back(get_max_thread_name_length() - 1);
   (void)NameStr;
-#if defined(HAVE_PTHREAD_SET_NAME_NP)
+#if defined(HAVE_PTHREAD_SET_NAME_NP) && HAVE_PTHREAD_SET_NAME_NP
   ::pthread_set_name_np(::pthread_self(), NameStr.data());
-#elif defined(HAVE_PTHREAD_SETNAME_NP)
+#elif defined(HAVE_PTHREAD_SETNAME_NP) && HAVE_PTHREAD_SETNAME_NP
 #if defined(__NetBSD__)
   ::pthread_setname_np(::pthread_self(), "%s",
                        const_cast<char *>(NameStr.data()));
diff --git a/llvm/lib/Support/VirtualOutputBackend.cpp b/llvm/lib/Support/VirtualOutputBackend.cpp
new file mode 100644
index 000000000000..97dab054dfa0
--- /dev/null
+++ b/llvm/lib/Support/VirtualOutputBackend.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements \c vfs::OutputBackend class methods.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/VirtualOutputBackend.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/VirtualOutputError.h"
+
+using namespace llvm;
+using namespace llvm::vfs;
+
+void OutputBackend::anchor() {}
+
+Expected<OutputFile>
+OutputBackend::createFile(const Twine &Path,
+                          std::optional<OutputConfig> Config) {
+  SmallString<128> PathStorage;
+  Path.toVector(PathStorage);
+
+  if (Config) {
+    // Check for invalid configs.
+    if (!Config->getText() && Config->getCRLF())
+      return make_error<OutputConfigError>(*Config, PathStorage);
+  }
+
+  std::unique_ptr<OutputFileImpl> Impl;
+  if (Error E = createFileImpl(PathStorage, Config).moveInto(Impl))
+    return std::move(E);
+  assert(Impl && "Expected valid Impl or Error");
+  return OutputFile(PathStorage, std::move(Impl));
+}
diff --git a/llvm/lib/Support/VirtualOutputBackends.cpp b/llvm/lib/Support/VirtualOutputBackends.cpp
new file mode 100644
index 000000000000..d6d7b8715bd4
--- /dev/null
+++ b/llvm/lib/Support/VirtualOutputBackends.cpp
@@ -0,0 +1,598 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the VirtualOutputBackend types, including:
+/// * NullOutputBackend: Outputs to NullOutputBackend are discarded.
+/// * FilteringOutputBackend: Filter paths from output.
+/// * MirroringOutputBackend: Mirror the output into two different backends.
+/// * OnDiskOutputBackend: Write output files to disk.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/VirtualOutputBackends.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/LockFileManager.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/VirtualOutputConfig.h"
+#include "llvm/Support/VirtualOutputError.h"
+
+using namespace llvm;
+using namespace llvm::vfs;
+
+void ProxyOutputBackend::anchor() {}
+void OnDiskOutputBackend::anchor() {}
+
+IntrusiveRefCntPtr<OutputBackend> vfs::makeNullOutputBackend() {
+  struct NullOutputBackend : public OutputBackend {
+    IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override {
+      return const_cast<NullOutputBackend *>(this); // Clone shares this object.
+    }
+    Expected<std::unique_ptr<OutputFileImpl>>
+    createFileImpl(StringRef Path, std::optional<OutputConfig>) override {
+      return std::make_unique<NullOutputFileImpl>(); // Path/config are unused.
+    }
+  };
+  // Hand out a fresh, ref-counted instance of the local backend type.
+  return makeIntrusiveRefCnt<NullOutputBackend>();
+}
+
+IntrusiveRefCntPtr<OutputBackend> vfs::makeFilteringOutputBackend(
+    IntrusiveRefCntPtr<OutputBackend> UnderlyingBackend,
+    std::function<bool(StringRef, std::optional<OutputConfig>)> Filter) {
+  using FilterFn = std::function<bool(StringRef, std::optional<OutputConfig>)>;
+  struct FilteringOutputBackend : public ProxyOutputBackend {
+    FilterFn Filter;
+    FilteringOutputBackend(IntrusiveRefCntPtr<OutputBackend> Underlying,
+                           FilterFn Predicate)
+        : ProxyOutputBackend(std::move(Underlying)),
+          Filter(std::move(Predicate)) {
+      assert(Filter && "Expected a non-null function");
+    }
+
+    // Paths rejected by the filter get a null output file instead.
+    Expected<std::unique_ptr<OutputFileImpl>>
+    createFileImpl(StringRef Path,
+                   std::optional<OutputConfig> Config) override {
+      if (!Filter(Path, Config))
+        return std::make_unique<NullOutputFileImpl>();
+      return ProxyOutputBackend::createFileImpl(Path, Config);
+    }
+    // A clone wraps a clone of the underlying backend and copies the filter.
+    IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override {
+      return makeIntrusiveRefCnt<FilteringOutputBackend>(
+          getUnderlyingBackend().clone(), Filter);
+    }
+  };
+  return makeIntrusiveRefCnt<FilteringOutputBackend>(
+      std::move(UnderlyingBackend), std::move(Filter));
+}
+
+IntrusiveRefCntPtr<OutputBackend>
+vfs::makeMirroringOutputBackend(IntrusiveRefCntPtr<OutputBackend> Backend1,
+                                IntrusiveRefCntPtr<OutputBackend> Backend2) {
+  struct ProxyOutputBackend1 : public ProxyOutputBackend {
+    using ProxyOutputBackend::ProxyOutputBackend;
+  };
+  struct ProxyOutputBackend2 : public ProxyOutputBackend {
+    using ProxyOutputBackend::ProxyOutputBackend;
+  }; // Distinct types so the backend below can derive from the proxy twice.
+  struct MirroringOutput final : public OutputFileImpl, raw_pwrite_stream {
+    Error keep() final {
+      flush();
+      return joinErrors(F1->keep(), F2->keep());
+    }
+    Error discard() final {
+      flush();
+      return joinErrors(F1->discard(), F2->discard());
+    }
+    raw_pwrite_stream &getOS() final { return *this; }
+
+    void write_impl(const char *Ptr, size_t Size) override {
+      F1->getOS().write(Ptr, Size);
+      F2->getOS().write(Ptr, Size);
+    }
+    void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) override {
+      this->flush();
+      F1->getOS().pwrite(Ptr, Size, Offset);
+      F2->getOS().pwrite(Ptr, Size, Offset);
+    }
+    uint64_t current_pos() const override { return F1->getOS().tell(); }
+    size_t preferred_buffer_size() const override {
+      return PreferredBufferSize;
+    }
+    void reserveExtraSpace(uint64_t ExtraSize) override {
+      F1->getOS().reserveExtraSpace(ExtraSize);
+      F2->getOS().reserveExtraSpace(ExtraSize);
+    }
+    bool is_displayed() const override {
+      return F1->getOS().is_displayed() && F2->getOS().is_displayed();
+    }
+    bool has_colors() const override {
+      return F1->getOS().has_colors() && F2->getOS().has_colors();
+    }
+    void enable_colors(bool enable) override {
+      raw_pwrite_stream::enable_colors(enable);
+      F1->getOS().enable_colors(enable);
+      F2->getOS().enable_colors(enable);
+    }
+    // Buffer once, here, using the larger of the two files' buffer sizes.
+    MirroringOutput(std::unique_ptr<OutputFileImpl> F1,
+                    std::unique_ptr<OutputFileImpl> F2)
+        : PreferredBufferSize(std::max(F1->getOS().GetBufferSize(),
+                                       F2->getOS().GetBufferSize())),
+          F1(std::move(F1)), F2(std::move(F2)) {
+      // Don't double buffer: the files' own streams become unbuffered.
+      this->F1->getOS().SetUnbuffered();
+      this->F2->getOS().SetUnbuffered();
+    }
+    size_t PreferredBufferSize; // Declared first: read before F1/F2 are moved.
+    std::unique_ptr<OutputFileImpl> F1;
+    std::unique_ptr<OutputFileImpl> F2;
+  };
+  struct MirroringOutputBackend : public ProxyOutputBackend1,
+                                  public ProxyOutputBackend2 {
+    Expected<std::unique_ptr<OutputFileImpl>>
+    createFileImpl(StringRef Path,
+                   std::optional<OutputConfig> Config) override {
+      std::unique_ptr<OutputFileImpl> File1;
+      std::unique_ptr<OutputFileImpl> File2;
+      if (Error E =
+              ProxyOutputBackend1::createFileImpl(Path, Config).moveInto(File1))
+        return std::move(E);
+      if (Error E = // The second file failed: also discard the first one.
+              ProxyOutputBackend2::createFileImpl(Path, Config).moveInto(File2))
+        return joinErrors(std::move(E), File1->discard());
+
+      // Skip the extra indirection if one of these is a null output.
+      if (isa<NullOutputFileImpl>(*File1)) {
+        consumeError(File1->discard());
+        return std::move(File2);
+      }
+      if (isa<NullOutputFileImpl>(*File2)) {
+        consumeError(File2->discard());
+        return std::move(File1);
+      }
+      return std::make_unique<MirroringOutput>(std::move(File1),
+                                               std::move(File2));
+    }
+
+    IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override {
+      return IntrusiveRefCntPtr<ProxyOutputBackend1>(
+          makeIntrusiveRefCnt<MirroringOutputBackend>(
+              ProxyOutputBackend1::getUnderlyingBackend().clone(),
+              ProxyOutputBackend2::getUnderlyingBackend().clone()));
+    }
+    void Retain() const { ProxyOutputBackend1::Retain(); }
+    void Release() const { ProxyOutputBackend1::Release(); }
+
+    MirroringOutputBackend(IntrusiveRefCntPtr<OutputBackend> Backend1,
+                           IntrusiveRefCntPtr<OutputBackend> Backend2)
+        : ProxyOutputBackend1(std::move(Backend1)),
+          ProxyOutputBackend2(std::move(Backend2)) {}
+  };
+
+  assert(Backend1 && "Expected actual backend");
+  assert(Backend2 && "Expected actual backend");
+  return IntrusiveRefCntPtr<ProxyOutputBackend1>(
+      makeIntrusiveRefCnt<MirroringOutputBackend>(std::move(Backend1),
+                                                  std::move(Backend2)));
+}
+
+static OutputConfig
+applySettings(std::optional<OutputConfig> &&Config,
+              const OnDiskOutputBackend::OutputSettings &Settings) {
+  // Caller's config (else the backend default), minus what Settings turns off.
+  OutputConfig Effective = Config ? *Config : Settings.DefaultConfig;
+  if (!Settings.UseTemporaries)
+    Effective.setNoAtomicWrite();
+  if (!Settings.RemoveOnSignal)
+    Effective.setNoDiscardOnSignal();
+  return Effective;
+}
+
+namespace {
+class OnDiskOutputFile final : public OutputFileImpl {
+public:
+  Error keep() override;
+  Error discard() override;
+  raw_pwrite_stream &getOS() override {
+    assert(FileOS && "Expected valid file");
+    if (BufferOS) // Prefer the buffering stream when one was set up.
+      return *BufferOS;
+    return *FileOS;
+  }
+
+  /// Attempt to open a temporary file for \a OutputPath.
+  ///
+  /// This tries to open a uniquely-named temporary file next to \a
+  /// OutputPath. If that fails because a directory is missing, the parent
+  /// directory is created and the attempt repeated, unless \a Config has
+  /// NoImplyCreateDirectories set.
+  ///
+  /// \post FD and \a TempPath are initialized if this is successful.
+  Error tryToCreateTemporary(std::optional<int> &FD);
+
+  Error initializeFile(std::optional<int> &FD);
+  Error initializeStream();
+  Error reset();
+
+  OnDiskOutputFile(StringRef OutputPath, std::optional<OutputConfig> Config,
+                   const OnDiskOutputBackend::OutputSettings &Settings)
+      : Config(applySettings(std::move(Config), Settings)),
+        OutputPath(OutputPath.str()) {}
+
+  OutputConfig Config; // Effective config, i.e. the result of applySettings().
+  const std::string OutputPath;
+  std::optional<std::string> TempPath; // Set once a temporary was created.
+  std::optional<raw_fd_ostream> FileOS;
+  std::optional<buffer_ostream> BufferOS; // Non-seekable, non-text output only.
+};
+} // end namespace
+
+static Error createDirectoriesOnDemand(StringRef OutputPath,
+                                       OutputConfig Config,
+                                       llvm::function_ref<Error()> CreateFile) {
+  // On ENOENT, create the parent directory (if allowed) and retry once.
+  return handleErrors(CreateFile(), [&](std::unique_ptr<ECError> Failure) {
+    if (Config.getNoImplyCreateDirectories() ||
+        Failure->convertToErrorCode() != std::errc::no_such_file_or_directory)
+      return Error(std::move(Failure));
+    StringRef Parent = sys::path::parent_path(OutputPath);
+    if (std::error_code MkdirEC = sys::fs::create_directories(Parent))
+      return make_error<OutputError>(Parent, MkdirEC);
+    return CreateFile();
+  });
+}
+
+Error OnDiskOutputFile::tryToCreateTemporary(std::optional<int> &FD) {
+  // Create a temporary file.
+  // Its name model inserts -%%%%%%%% before the extension (if any) and,
+  // because some tools (notably clang's own GlobalModuleIndex.cpp) glob for
+  // build artifacts, also appends .tmp.
+  StringRef Ext = sys::path::extension(OutputPath);
+  SmallString<128> Model = StringRef(OutputPath).drop_back(Ext.size());
+  Model += "-%%%%%%%%";
+  Model += Ext;
+  Model += ".tmp";
+
+  return createDirectoriesOnDemand(OutputPath, Config, [&]() -> Error {
+    int TempFD;
+    SmallString<128> TempName;
+    if (std::error_code EC = sys::fs::createUniqueFile(Model, TempFD, TempName))
+      return make_error<TempFileOutputError>(Model, OutputPath, EC);
+
+    // Register the temporary for removal on signal when the config asks.
+    if (Config.getDiscardOnSignal())
+      sys::RemoveFileOnSignal(TempName);
+
+    // Success: publish the temporary's path and its descriptor.
+    TempPath = TempName.str().str();
+    FD.emplace(TempFD);
+    return Error::success();
+  });
+}
+
+Error OnDiskOutputFile::initializeFile(std::optional<int> &FD) {
+  assert(OutputPath != "-" && "Unexpected request for FD of stdout");
+
+  // When atomic writes are requested, inspect an existing destination first:
+  // temporaries are disabled for anything that is not a regular file, and an
+  // unwritable destination is reported right away.
+  if (Config.getAtomicWrite()) {
+    sys::fs::file_status DestStatus;
+    sys::fs::status(OutputPath, DestStatus);
+    if (sys::fs::exists(DestStatus)) {
+      if (!sys::fs::is_regular_file(DestStatus))
+        Config.setNoAtomicWrite();
+
+      // Fail now if we can't write to the final destination.
+      if (!sys::fs::can_write(OutputPath))
+        return make_error<OutputError>(
+            OutputPath,
+            std::make_error_code(std::errc::operation_not_permitted));
+    }
+  }
+
+  // If (still) using a temporary file, try to create it. A failure is consumed
+  // by errorToBool() and we fall through to opening the destination directly.
+  if (Config.getAtomicWrite() && !errorToBool(tryToCreateTemporary(FD)))
+    return Error::success();
+
+  // Not using a temporary file. Open the final output file.
+  return createDirectoriesOnDemand(OutputPath, Config, [&]() -> Error {
+    sys::fs::OpenFlags Flags = sys::fs::OF_None;
+    if (Config.getTextWithCRLF())
+      Flags |= sys::fs::OF_TextWithCRLF;
+    else if (Config.getText())
+      Flags |= sys::fs::OF_Text;
+    if (Config.getAppend())
+      Flags |= sys::fs::OF_Append;
+
+    int DestFD;
+    if (std::error_code EC = sys::fs::openFileForWrite(
+            OutputPath, DestFD, sys::fs::CD_CreateAlways, Flags))
+      return convertToOutputError(OutputPath, EC);
+    FD.emplace(DestFD);
+
+    if (Config.getDiscardOnSignal())
+      sys::RemoveFileOnSignal(OutputPath);
+    return Error::success();
+  });
+}
+
+Error OnDiskOutputFile::initializeStream() {
+  // Open the file stream: a real file via its descriptor, or stdout for "-".
+  if (OutputPath != "-") {
+    std::optional<int> Descriptor;
+    if (Error InitErr = initializeFile(Descriptor))
+      return InitErr;
+    FileOS.emplace(*Descriptor, /*shouldClose=*/true);
+  } else {
+    std::error_code OpenEC;
+    FileOS.emplace(OutputPath, OpenEC);
+    if (OpenEC)
+      return make_error<OutputError>(OutputPath, OpenEC);
+  }
+
+  // Non-text output to a stream that cannot seek goes through a buffer.
+  const bool NeedsBuffer = !(FileOS->supportsSeeking() || Config.getText());
+  if (NeedsBuffer)
+    BufferOS.emplace(*FileOS);
+  return Error::success();
+}
+
+namespace {
+class OpenFileRAII {
+  static const int InvalidFd = -1; // Sentinel: no descriptor is held.
+
+public:
+  int Fd = InvalidFd; // Closed by the destructor unless still InvalidFd.
+
+  ~OpenFileRAII() {
+    if (Fd != InvalidFd)
+      llvm::sys::Process::SafelyCloseFileDescriptor(Fd);
+  }
+};
+
+enum class FileDifference : uint8_t {
+  /// The source and destination paths refer to the exact same file.
+  IdenticalFile,
+  /// The source and destination paths refer to separate files with identical
+  /// contents.
+  SameContents,
+  /// The source and destination paths refer to separate files with different
+  /// contents.
+  DifferentContents
+};
+} // end anonymous namespace
+
+static Expected<FileDifference>
+areFilesDifferent(const llvm::Twine &Source, const llvm::Twine &Destination) {
+  if (sys::fs::equivalent(Source, Destination))
+    return FileDifference::IdenticalFile;
+
+  // Problems with the source are hard errors. First: it must open.
+  OpenFileRAII Src;
+  sys::fs::file_status SrcStatus;
+  if (std::error_code EC = sys::fs::openFileForRead(Source, Src.Fd))
+    return convertToOutputError(Source, EC);
+
+  // Second: it must be possible to stat it.
+  if (std::error_code EC = sys::fs::status(Src.Fd, SrcStatus))
+    return convertToOutputError(Source, EC);
+
+  // Problems with the destination only mean "different". If we can't open
+  // the destination file, report different.
+  OpenFileRAII Dst;
+  sys::fs::file_status DstStatus;
+  if (sys::fs::openFileForRead(Destination, Dst.Fd))
+    return FileDifference::DifferentContents;
+
+  // If we can't stat the destination file, report different.
+  if (sys::fs::status(Dst.Fd, DstStatus))
+    return FileDifference::DifferentContents;
+
+  // If the files are different sizes, they must be different.
+  const uint64_t Size = SrcStatus.getSize();
+  if (Size != DstStatus.getSize())
+    return FileDifference::DifferentContents;
+
+  // If both files are zero size, they must be the same.
+  if (Size == 0)
+    return FileDifference::SameContents;
+
+  // The two files match in size, so we have to compare the bytes to determine
+  // if they're the same. Map the source first; failing to do so is an error.
+  std::error_code SrcMapEC;
+  sys::fs::mapped_file_region SrcMap(
+      sys::fs::convertFDToNativeFile(Src.Fd),
+      sys::fs::mapped_file_region::readonly, Size, 0, SrcMapEC);
+  if (SrcMapEC)
+    return convertToOutputError(Source, SrcMapEC);
+
+  // Failing to map the destination again just means "different".
+  std::error_code DstMapEC;
+  sys::fs::mapped_file_region DstMap(
+      sys::fs::convertFDToNativeFile(Dst.Fd),
+      sys::fs::mapped_file_region::readonly, Size, 0, DstMapEC);
+  if (DstMapEC)
+    return FileDifference::DifferentContents;
+
+  // Same size and both mapped: the verdict is a plain byte comparison.
+  const bool Same = memcmp(SrcMap.const_data(), DstMap.const_data(), Size) == 0;
+  return Same ? FileDifference::SameContents
+              : FileDifference::DifferentContents;
+}
+
+Error OnDiskOutputFile::reset() {
+  // Destroy the streams (buffer first) to flush them.
+  BufferOS.reset();
+  std::error_code StreamEC;
+  if (FileOS) {
+    // Save the stream's error for the caller, then clear it on the stream
+    // itself so that destroying the stream does not raise a fatal error.
+    StreamEC = FileOS->error();
+    FileOS->clear_error();
+    FileOS.reset();
+  }
+  return errorCodeToError(StreamEC);
+}
+
+Error OnDiskOutputFile::keep() {
+  if (auto E = reset())
+    return E;
+
+  // On every exit path below, unregister the file from removal-on-signal.
+  auto RemoveDiscardOnSignal = make_scope_exit([&]() {
+    if (Config.getDiscardOnSignal())
+      sys::DontRemoveFileOnSignal(TempPath ? *TempPath : OutputPath);
+  });
+
+  if (!TempPath) // Written in place: there is nothing to move.
+    return Error::success();
+
+  // See if we should append instead of move.
+  if (Config.getAppend() && OutputPath != "-") {
+    // Read TempFile for the content to append.
+    auto Content = MemoryBuffer::getFile(*TempPath);
+    if (!Content)
+      return convertToTempFileOutputError(*TempPath, OutputPath,
+                                          Content.getError());
+    while (1) {
+      // Attempt to lock the output file.
+      // Only one process is allowed to append to this file at a time.
+      llvm::LockFileManager Lock(OutputPath);
+      bool Owned;
+      if (Error Err = Lock.tryLock().moveInto(Owned)) {
+        // If we error acquiring a lock, we cannot ensure appends
+        // to the output file are atomic - cannot ensure output correctness.
+        Lock.unsafeMaybeUnlock();
+        return convertToOutputError(
+            OutputPath, std::make_error_code(std::errc::no_lock_available));
+      }
+      if (Owned) {
+        // Lock acquired, perform the write and release the lock.
+        std::error_code EC;
+        llvm::raw_fd_ostream Out(OutputPath, EC, llvm::sys::fs::OF_Append);
+        if (EC)
+          return convertToOutputError(OutputPath, EC);
+        Out << (*Content)->getBuffer();
+        Out.close();
+        Lock.unsafeMaybeUnlock();
+        if (Out.has_error())
+          return convertToOutputError(OutputPath, Out.error());
+        // Remove temp file and done.
+        (void)sys::fs::remove(*TempPath);
+        return Error::success();
+      }
+      // Someone else owns the lock on this file, wait.
+      switch (Lock.waitForUnlockFor(std::chrono::seconds(256))) {
+      case WaitForUnlockResult::Success:
+        LLVM_FALLTHROUGH;
+      case WaitForUnlockResult::OwnerDied: {
+        continue; // try again to get the lock.
+      }
+      case WaitForUnlockResult::Timeout: {
+        // We could error on timeout to avoid potentially hanging forever, but
+        // it may be more likely that an interrupted process failed to clear
+        // the lock, causing other waiting processes to time-out. Let's clear
+        // the lock and try again right away. If we do start seeing compiler
+        // hangs in this location, we will need to re-consider.
+        Lock.unsafeMaybeUnlock();
+        continue;
+      }
+      }
+      break; // Only reached for a wait result not handled above.
+    }
+  }
+
+  if (Config.getOnlyIfDifferent()) {
+    auto Result = areFilesDifferent(*TempPath, OutputPath);
+    if (!Result)
+      return Result.takeError();
+    switch (*Result) {
+    case FileDifference::IdenticalFile:
+      // Do nothing for a self-move.
+      return Error::success();
+
+    case FileDifference::SameContents:
+      // Contents are identical; remove the source (temporary) file.
+      (void)sys::fs::remove(*TempPath);
+      return Error::success();
+
+    case FileDifference::DifferentContents:
+      break; // Rename the file.
+    }
+  }
+
+  // Move temporary to the final output path and remove it if that fails.
+  std::error_code RenameEC = sys::fs::rename(*TempPath, OutputPath);
+  if (!RenameEC)
+    return Error::success();
+
+  // FIXME: TempPath should be in the same directory as OutputPath but try to
+  // copy the output to see if makes any difference. If this path is used,
+  // investigate why we need to copy.
+  RenameEC = sys::fs::copy_file(*TempPath, OutputPath);
+  (void)sys::fs::remove(*TempPath);
+
+  if (!RenameEC) // The copy fallback worked.
+    return Error::success();
+
+  return make_error<TempFileOutputError>(*TempPath, OutputPath, RenameEC);
+}
+
+Error OnDiskOutputFile::discard() {
+  // Destroy the streams to flush them; a stream error ends the discard early.
+  if (Error ResetErr = reset())
+    return ResetErr;
+
+  // Nothing on the filesystem to remove for stdout.
+  if (OutputPath == "-")
+    return Error::success();
+
+  // Delete the file that's in progress (the temporary if one exists, else the
+  // output itself) and unregister it from removal-on-signal.
+  const StringRef InProgress =
+      TempPath ? StringRef(*TempPath) : StringRef(OutputPath);
+  const std::error_code RemoveEC = sys::fs::remove(InProgress);
+  sys::DontRemoveFileOnSignal(InProgress);
+
+  // Report a removal failure with the error type that fits the file kind.
+  if (TempPath)
+    return convertToTempFileOutputError(*TempPath, OutputPath, RemoveEC);
+  return convertToOutputError(OutputPath, RemoveEC);
+}
+
+Error OnDiskOutputBackend::makeAbsolute(SmallVectorImpl<char> &Path) const {
+  std::error_code EC = sys::fs::make_absolute(Path); // May rewrite Path.
+  return convertToOutputError(StringRef(Path.data(), Path.size()), EC);
+}
+
+Expected<std::unique_ptr<OutputFileImpl>>
+OnDiskOutputBackend::createFileImpl(StringRef Path,
+                                    std::optional<OutputConfig> Config) {
+  // "-" is passed through untouched; any other path is made absolute first.
+  const bool IsStdout = Path == "-";
+  SmallString<256> Absolute(IsStdout ? StringRef() : Path);
+  if (!IsStdout)
+    if (Error AbsErr = makeAbsolute(Absolute))
+      return std::move(AbsErr);
+  const StringRef FinalPath = IsStdout ? Path : StringRef(Absolute);
+
+  std::unique_ptr<OnDiskOutputFile> Output =
+      std::make_unique<OnDiskOutputFile>(FinalPath, Config, Settings);
+  if (Error InitErr = Output->initializeStream())
+    return std::move(InitErr);
+  return std::move(Output);
+}
diff --git a/llvm/lib/Support/VirtualOutputConfig.cpp b/llvm/lib/Support/VirtualOutputConfig.cpp
new file mode 100644
index 000000000000..4672a0dad65d
--- /dev/null
+++ b/llvm/lib/Support/VirtualOutputConfig.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements \c OutputConfig class methods.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/VirtualOutputConfig.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::vfs;
+
+OutputConfig &OutputConfig::setOpenFlags(const sys::fs::OpenFlags &Flags) {
+  using namespace llvm::sys::fs;
+  // CRLF is only looked at together with Text; on its own it is ignored.
+  if (Flags & OF_Text)
+    return setText().setCRLF(Flags & OF_CRLF).setAppend(Flags & OF_Append);
+  return setBinary().setAppend(Flags & OF_Append);
+}
+
+void OutputConfig::print(raw_ostream &OS) const {
+  // Emits "{A,NoB,...}", listing only the flags that differ from their
+  // default; a flag that is currently off is spelled with a "No" prefix.
+  OS << "{";
+
+  // The separator is empty before the first flag and "," afterwards.
+  const char *Separator = "";
+  auto emitFlag = [&](StringRef Name, bool IsSet) {
+    OS << Separator << (IsSet ? "" : "No") << Name;
+    Separator = ",";
+  };
+
+  // Expand one check per flag listed in VirtualOutputConfig.def.
+#define HANDLE_OUTPUT_CONFIG_FLAG(NAME, DEFAULT)                               \
+  if (get##NAME() != DEFAULT)                                                  \
+    emitFlag(#NAME, get##NAME());
+#include "llvm/Support/VirtualOutputConfig.def"
+  OS << "}";
+}
+
+LLVM_DUMP_METHOD void OutputConfig::dump() const { print(dbgs()); }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, OutputConfig Config) {
+  Config.print(OS);
+  return OS;
+}
diff --git a/llvm/lib/Support/VirtualOutputError.cpp b/llvm/lib/Support/VirtualOutputError.cpp
new file mode 100644
index 000000000000..c899c621205f
--- /dev/null
+++ b/llvm/lib/Support/VirtualOutputError.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the errors for output virtualization.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/VirtualOutputError.h"
+
+using namespace llvm;
+using namespace llvm::vfs;
+
+void OutputError::anchor() {}
+void OutputConfigError::anchor() {}
+void TempFileOutputError::anchor() {}
+
+char OutputError::ID = 0;
+char OutputConfigError::ID = 0;
+char TempFileOutputError::ID = 0;
+
+void OutputError::log(raw_ostream &OS) const {
+  OS << getOutputPath() << ": "; // Path prefix, then what ECError::log prints.
+  ECError::log(OS);
+}
+
+void OutputConfigError::log(raw_ostream &OS) const {
+  OutputError::log(OS); // OutputError text first, then ": <config>".
+  OS << ": " << Config;
+}
+
+void TempFileOutputError::log(raw_ostream &OS) const {
+  OS << getTempPath() << " => "; // "<temp path> => ", then OutputError text.
+  OutputError::log(OS);
+}
+
+namespace {
+class OutputErrorCategory : public std::error_category {
+public:
+  const char *name() const noexcept override;
+  std::string message(int EV) const override;
+};
+} // end namespace
+
+const std::error_category &vfs::output_category() {
+  static OutputErrorCategory ErrorCategory;
+  return ErrorCategory;
+}
+
+const char *OutputErrorCategory::name() const noexcept {
+  return "llvm.vfs.output";
+}
+
+std::string OutputErrorCategory::message(int EV) const {
+  OutputErrorCode E = static_cast<OutputErrorCode>(EV);
+  switch (E) {
+  case OutputErrorCode::invalid_config:
+    return "invalid config";
+  case OutputErrorCode::not_closed:
+    return "output not closed";
+  case OutputErrorCode::already_closed:
+    return "output already closed";
+  case OutputErrorCode::has_open_proxy:
+    return "output has open proxy";
+  }
+  llvm_unreachable(
+      "An enumerator of OutputErrorCode does not have a message defined.");
+}
diff --git a/llvm/lib/Support/VirtualOutputFile.cpp b/llvm/lib/Support/VirtualOutputFile.cpp
new file mode 100644
index 000000000000..62f54266d3be
--- /dev/null
+++ b/llvm/lib/Support/VirtualOutputFile.cpp
@@ -0,0 +1,110 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements \c OutputFile class methods.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/VirtualOutputFile.h"
+#include "llvm/Support/VirtualOutputError.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/raw_ostream_proxy.h"
+
+using namespace llvm;
+using namespace llvm::vfs;
+
+char OutputFileImpl::ID = 0;
+char NullOutputFileImpl::ID = 0;
+
+void OutputFileImpl::anchor() {}
+void NullOutputFileImpl::anchor() {}
+
+class OutputFile::TrackedProxy : public raw_pwrite_stream_proxy {
+public:
+  // Owner's slot for its open proxy; set back to null by resetProxy().
+  TrackedProxy *&TrackingPointer;
+
+  explicit TrackedProxy(TrackedProxy *&Slot, raw_pwrite_stream &OS)
+      : raw_pwrite_stream_proxy(OS), TrackingPointer(Slot) {
+    assert(!Slot && "Expected to add a proxy");
+    Slot = this;
+  }
+  ~TrackedProxy() override { resetProxy(); }
+  // Clear the owner's slot and reset the proxied stream.
+  void resetProxy() {
+    TrackingPointer = nullptr;
+    resetProxiedOS();
+  }
+};
+
+Expected<std::unique_ptr<raw_pwrite_stream>> OutputFile::createProxy() {
+  if (OpenProxy) // Only one proxy may be open at a time.
+    return make_error<OutputError>(getPath(), OutputErrorCode::has_open_proxy);
+  // The new proxy stores itself in OpenProxy (see TrackedProxy's constructor).
+  return std::make_unique<TrackedProxy>(OpenProxy, getOS());
+}
+
+Error OutputFile::keep() {
+  // Catch double-closing logic bugs.
+  if (LLVM_UNLIKELY(!Impl))
+    report_fatal_error(
+        make_error<OutputError>(getPath(), OutputErrorCode::already_closed));
+
+  // Report a fatal error if there's an open proxy and the file is being kept.
+  // This is safer than relying on clients to remember to flush(). Also call
+  // OutputFile::discard() to give the backend a chance to clean up any
+  // side effects (such as temporaries).
+  if (LLVM_UNLIKELY(OpenProxy))
+    report_fatal_error(joinErrors(
+        make_error<OutputError>(getPath(), OutputErrorCode::has_open_proxy),
+        discard()));
+
+  Error E = Impl->keep();
+  Impl = nullptr; // A later keep()/discard() now reports already_closed.
+  DiscardOnDestroyHandler = nullptr;
+  return E;
+}
+
+Error OutputFile::discard() {
+  // Catch double-closing logic bugs.
+  if (LLVM_UNLIKELY(!Impl))
+    report_fatal_error(
+        make_error<OutputError>(getPath(), OutputErrorCode::already_closed));
+
+  // Be lenient about open proxies since client teardown paths won't
+  // necessarily clean up in the right order. Reset the proxy to flush any
+  // current content; if there is another write, there should be a quick crash
+  // on null dereference.
+  if (OpenProxy)
+    OpenProxy->resetProxy();
+
+  Error E = Impl->discard();
+  Impl = nullptr; // A later keep()/discard() now reports already_closed.
+  DiscardOnDestroyHandler = nullptr;
+  return E;
+}
+
+void OutputFile::destroy() {
+  if (!Impl)
+    return;
+
+  // discard() clears DiscardOnDestroyHandler, so take the handler out first.
+  auto Handler = std::move(DiscardOnDestroyHandler);
+  Error DiscardErr = discard();
+  assert(!Impl && "Expected discard to destroy Impl");
+
+  // Destroying a still-open file without a handler installed is fatal.
+  if (LLVM_UNLIKELY(!Handler))
+    llvm::report_fatal_error(joinErrors(
+        make_error<OutputError>(getPath(), OutputErrorCode::not_closed),
+        std::move(DiscardErr)));
+  // Otherwise hand a discard failure, if there was one, to the handler.
+  if (DiscardErr)
+    Handler(std::move(DiscardErr));
+}
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index b11f216adeba..968423b98486 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -31,23 +31,22 @@ llvm_execute_on_thread_impl(unsigned(__stdcall *ThreadFunc)(void *), void *Arg,
   HANDLE hThread = (HANDLE)::_beginthreadex(NULL, StackSizeInBytes.value_or(0),
                                             ThreadFunc, Arg, 0, NULL);
 
-  if (!hThread) {
+  if (!hThread)
     ReportLastErrorFatal("_beginthreadex failed");
-  }
 
   return hThread;
 }
 
 void llvm_thread_join_impl(HANDLE hThread) {
-  if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED) {
+  if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED)
     ReportLastErrorFatal("WaitForSingleObject failed");
-  }
+  if (::CloseHandle(hThread) == FALSE)
+    ReportLastErrorFatal("CloseHandle failed");
 }
 
 void llvm_thread_detach_impl(HANDLE hThread) {
-  if (::CloseHandle(hThread) == FALSE) {
+  if (::CloseHandle(hThread) == FALSE)
     ReportLastErrorFatal("CloseHandle failed");
-  }
 }
 
 DWORD llvm_thread_get_id_impl(HANDLE hThread) { return ::GetThreadId(hThread); }
@@ -202,9 +201,9 @@ template <typename F>
 static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
   DWORD Len = 0;
   BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
-  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
     return false;
-  }
+
   auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
   R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
   if (R) {
diff --git a/llvm/lib/Support/raw_ostream_proxy.cpp b/llvm/lib/Support/raw_ostream_proxy.cpp
new file mode 100644
index 000000000000..2bbaa82f4afa
--- /dev/null
+++ b/llvm/lib/Support/raw_ostream_proxy.cpp
@@ -0,0 +1,15 @@
+//===- raw_ostream_proxy.cpp - Implement the raw_ostream proxies ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/raw_ostream_proxy.h"
+
+using namespace llvm;
+
+void raw_ostream_proxy::anchor() {}
+
+void raw_pwrite_stream_proxy::anchor() {}
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 3657a15ab198..051a896cfd1b 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -525,6 +525,18 @@ std::optional<int64_t> BitsInit::convertInitializerToInt() const {
   return Result;
 }
 
+/// Pack the known bits of this initializer into an integer. Bit Idx of the
+/// result takes the value of element Idx when that element is a BitInit;
+/// elements of any other kind leave their bit as zero.
+uint64_t BitsInit::convertKnownBitsToInt() const {
+  uint64_t Result = 0;
+  // Build the mask in uint64_t so bit 63 never passes through a signed type.
+  for (auto [Idx, InitV] : enumerate(getBits()))
+    if (auto *Bit = dyn_cast<BitInit>(InitV))
+      Result |= static_cast<uint64_t>(Bit->getValue()) << Idx;
+  return Result;
+}
+
 const Init *
 BitsInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
   SmallVector<const Init *, 16> NewBits(Bits.size());
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 0c6add59cb28..f928ded16186 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -33,6 +33,14 @@ using namespace llvm;
 
 namespace llvm {
 
+RecordsEntry::RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {}
+RecordsEntry::RecordsEntry(std::unique_ptr<ForeachLoop> Loop)
+    : Loop(std::move(Loop)) {}
+RecordsEntry::RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion)
+    : Assertion(std::move(Assertion)) {}
+RecordsEntry::RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump)
+    : Dump(std::move(Dump)) {}
+
 struct SubClassReference {
   SMRange RefRange;
   const Record *Rec = nullptr;
diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h
index 7edb6c7a9aac..09b7d5380695 100644
--- a/llvm/lib/TableGen/TGParser.h
+++ b/llvm/lib/TableGen/TGParser.h
@@ -46,12 +46,10 @@ struct RecordsEntry {
   void dump() const;
 
   RecordsEntry() = default;
-  RecordsEntry(std::unique_ptr<Record> Rec) : Rec(std::move(Rec)) {}
-  RecordsEntry(std::unique_ptr<ForeachLoop> Loop) : Loop(std::move(Loop)) {}
-  RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion)
-      : Assertion(std::move(Assertion)) {}
-  RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump)
-      : Dump(std::move(Dump)) {}
+  RecordsEntry(std::unique_ptr<Record> Rec);
+  RecordsEntry(std::unique_ptr<ForeachLoop> Loop);
+  RecordsEntry(std::unique_ptr<Record::AssertionInfo> Assertion);
+  RecordsEntry(std::unique_ptr<Record::DumpInfo> Dump);
 };
 
 /// ForeachLoop - Record the iteration state associated with a for loop.
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c52487ab8a79..c31a090bba77 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ private:
 
   /// Emit instruction to set float register to zero.
   void emitFMov0(const MachineInstr &MI);
+  void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
 
   using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
 
@@ -734,7 +735,7 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) {
   const Triple &TT = TM.getTargetTriple();
   assert(TT.isOSBinFormatELF());
   std::unique_ptr<MCSubtargetInfo> STI(
-      TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+      TM.getTarget().createMCSubtargetInfo(TT, "", ""));
   assert(STI && "Unable to create subtarget info");
   this->STI = static_cast<const AArch64Subtarget *>(&*STI);
 
@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
 
 void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
   Register DestReg = MI.getOperand(0).getReg();
-  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
-      STI->isNeonAvailable()) {
-    // Convert H/S register to corresponding D register
-    if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
-      DestReg = AArch64::D0 + (DestReg - AArch64::H0);
-    else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
-      DestReg = AArch64::D0 + (DestReg - AArch64::S0);
-    else
-      assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
-
-    MCInst MOVI;
-    MOVI.setOpcode(AArch64::MOVID);
-    MOVI.addOperand(MCOperand::createReg(DestReg));
-    MOVI.addOperand(MCOperand::createImm(0));
-    EmitToStreamer(*OutStreamer, MOVI);
-  } else {
-    MCInst FMov;
-    switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unexpected opcode");
-    case AArch64::FMOVH0:
-      FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
-      if (!STI->hasFullFP16())
-        DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
-      FMov.addOperand(MCOperand::createReg(DestReg));
-      FMov.addOperand(MCOperand::createReg(AArch64::WZR));
-      break;
-    case AArch64::FMOVS0:
-      FMov.setOpcode(AArch64::FMOVWSr);
-      FMov.addOperand(MCOperand::createReg(DestReg));
-      FMov.addOperand(MCOperand::createReg(AArch64::WZR));
-      break;
-    case AArch64::FMOVD0:
-      FMov.setOpcode(AArch64::FMOVXDr);
-      FMov.addOperand(MCOperand::createReg(DestReg));
-      FMov.addOperand(MCOperand::createReg(AArch64::XZR));
-      break;
-    }
-    EmitToStreamer(*OutStreamer, FMov);
-  }
-}
+  // MOVI-based zeroing is only used when NEON is available and the subtarget
+  // does not request the zero-cycle zeroing FP workaround. FPR64 zeroing is
+  // preferred over FPR128 zeroing when both are available.
+  const bool MayUseMOVI =
+      !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable();
+  const bool ZeroViaD = MayUseMOVI && STI->hasZeroCycleZeroingFPR64();
+  const bool ZeroViaQ =
+      MayUseMOVI && !ZeroViaD && STI->hasZeroCycleZeroingFPR128();
+  // No usable zero-cycle MOVI form: fall back to FMOV from the zero register.
+  if (!ZeroViaD && !ZeroViaQ) {
+    emitFMov0AsFMov(MI, DestReg);
+    return;
+  }
+
+  // Convert the H/S (or, for Q zeroing, H/S/D) register to the corresponding
+  // D or Q register.
+  const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+  const auto *WideRC =
+      ZeroViaD ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass;
+  if (AArch64::FPR16RegClass.contains(DestReg)) {
+    DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub, WideRC);
+  } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+    DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub, WideRC);
+  } else {
+    assert(AArch64::FPR64RegClass.contains(DestReg));
+    // Only Q zeroing needs to widen a D register.
+    if (ZeroViaQ)
+      DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub, WideRC);
+  }
+
+  // Zero the selected register with a MOVI of immediate 0.
+  MCInst MOVI;
+  MOVI.setOpcode(ZeroViaD ? AArch64::MOVID : AArch64::MOVIv2d_ns);
+  MOVI.addOperand(MCOperand::createReg(DestReg));
+  MOVI.addOperand(MCOperand::createImm(0));
+  EmitToStreamer(*OutStreamer, MOVI);
+}
+
+// Emit the zeroing pseudo MI as an FMOV from the zero register: FMOVH0 and
+// FMOVS0 move WZR, FMOVD0 moves XZR. DestReg is the register to write; for
+// FMOVH0 without full FP16 it is replaced by the S register of the same index.
+void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
+                                        Register DestReg) {
+  unsigned Opc;
+  Register ZeroReg = AArch64::WZR;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case AArch64::FMOVH0:
+    if (STI->hasFullFP16()) {
+      Opc = AArch64::FMOVWHr;
+    } else {
+      // No full FP16: use the S register with the same index.
+      Opc = AArch64::FMOVWSr;
+      DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
+    }
+    break;
+  case AArch64::FMOVS0:
+    Opc = AArch64::FMOVWSr;
+    break;
+  case AArch64::FMOVD0:
+    Opc = AArch64::FMOVXDr;
+    ZeroReg = AArch64::XZR;
+    break;
+  }
+
+  MCInst FMov;
+  FMov.setOpcode(Opc);
+  FMov.addOperand(MCOperand::createReg(DestReg));
+  FMov.addOperand(MCOperand::createReg(ZeroReg));
+  EmitToStreamer(*OutStreamer, FMov);
+}
 
 Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
@@ -2229,13 +2262,24 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   if (BrTarget == AddrDisc)
     report_fatal_error("Branch target is signed with its own value");
 
-  // If we are printing BLRA pseudo instruction, then x16 and x17 are
-  // implicit-def'ed by the MI and AddrDisc is not used as any other input, so
-  // try to save one MOV by setting MayUseAddrAsScratch.
+  // If we are printing BLRA pseudo, try to save one MOV by making use of the
+  // fact that x16 and x17 are described as clobbered by the MI and AddrDisc is
+  // not used as any other input.
+  //
+  // Back in the day, emitPtrauthDiscriminator was restricted to only returning
+  // either x16 or x17, meaning the returned register is always among the
+  // implicit-def'ed registers of BLRA pseudo. Now this property can be violated
+  // if isX16X17Safer predicate is false, thus manually check if AddrDisc is
+  // among x16 and x17 to prevent clobbering unexpected registers.
+  //
   // Unlike BLRA, BRA pseudo is used to perform computed goto, and thus not
   // declared as clobbering x16/x17.
+  //
+  // FIXME: Make use of `killed` flags and register masks instead.
+  bool AddrDiscIsImplicitDef =
+      IsCall && (AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17);
   Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, AArch64::X17,
-                                              /*MayUseAddrAsScratch=*/IsCall);
+                                              AddrDiscIsImplicitDef);
   bool IsZeroDisc = DiscReg == AArch64::XZR;
 
   unsigned Opc;
@@ -2862,7 +2906,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
       MCInst TmpInst;
       TmpInst.setOpcode(AArch64::MOVIv16b_ns);
       TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
-      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+      TmpInst.addOperand(MCOperand::createImm(0));
       EmitToStreamer(*OutStreamer, TmpInst);
       return;
     }
@@ -2968,8 +3012,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     // See the comments in emitPtrauthBranch.
     if (Callee == AddrDisc)
       report_fatal_error("Call target is signed with its own value");
+
+    // After isX16X17Safer predicate was introduced, emitPtrauthDiscriminator is
+    // no longer restricted to only reusing AddrDisc when it is X16 or X17
+    // (which are implicit-def'ed by AUTH_TCRETURN pseudos), thus impose this
+    // restriction manually not to clobber an unexpected register.
+    bool AddrDiscIsImplicitDef =
+        AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17;
     Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, ScratchReg,
-                                                /*MayUseAddrAsScratch=*/true);
+                                                AddrDiscIsImplicitDef);
 
     const bool IsZero = DiscReg == AArch64::XZR;
     const unsigned Opcodes[2][2] = {{AArch64::BRAA, AArch64::BRAAZ},
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 3436dc9ef452..137ff898e86a 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -30,6 +30,14 @@ using namespace llvm;
 #define AARCH64_BRANCH_TARGETS_NAME "AArch64 Branch Targets"
 
 namespace {
+// BTI HINT encoding: base (32) plus 'c' (2) and/or 'j' (4).
+enum : unsigned {
+  BTIBase = 32,   // Base immediate for BTI HINT
+  BTIC = 1u << 1, // 2
+  BTIJ = 1u << 2, // 4
+  BTIMask = BTIC | BTIJ,
+};
+
 class AArch64BranchTargets : public MachineFunctionPass {
 public:
   static char ID;
@@ -42,6 +50,7 @@ private:
   void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
               bool NeedsWinCFI);
 };
+
 } // end anonymous namespace
 
 char AArch64BranchTargets::ID = 0;
@@ -62,9 +71,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
     return false;
 
-  LLVM_DEBUG(
-      dbgs() << "********** AArch64 Branch Targets  **********\n"
-             << "********** Function: " << MF.getName() << '\n');
+  LLVM_DEBUG(dbgs() << "********** AArch64 Branch Targets  **********\n"
+                    << "********** Function: " << MF.getName() << '\n');
   const Function &F = MF.getFunction();
 
   // LLVM does not consider basic blocks which are the targets of jump tables
@@ -103,6 +111,12 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
         JumpTableTargets.count(&MBB))
       CouldJump = true;
 
+    if (MBB.isEHPad()) {
+      if (HasWinCFI && (MBB.isEHFuncletEntry() || MBB.isCleanupFuncletEntry()))
+        CouldCall = true;
+      else
+        CouldJump = true;
+    }
     if (CouldCall || CouldJump) {
       addBTI(MBB, CouldCall, CouldJump, HasWinCFI);
       MadeChange = true;
@@ -130,7 +144,12 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
 
   auto MBBI = MBB.begin();
 
-  // Skip the meta instructions, those will be removed anyway.
+  // If the block starts with EH_LABEL(s), skip them first.
+  while (MBBI != MBB.end() && MBBI->isEHLabel()) {
+    ++MBBI;
+  }
+
+  // Skip meta/CFI/etc. (and EMITBKEY) to reach the first executable insn.
   for (; MBBI != MBB.end() &&
          (MBBI->isMetaInstruction() || MBBI->getOpcode() == AArch64::EMITBKEY);
        ++MBBI)
@@ -138,16 +157,21 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
 
   // SCTLR_EL1.BT[01] is set to 0 by default which means
   // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there.
-  if (MBBI != MBB.end() && HintNum == 34 &&
+  if (MBBI != MBB.end() && ((HintNum & BTIMask) == BTIC) &&
       (MBBI->getOpcode() == AArch64::PACIASP ||
        MBBI->getOpcode() == AArch64::PACIBSP))
     return;
 
-  if (HasWinCFI && MBBI->getFlag(MachineInstr::FrameSetup)) {
-    BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
-            TII->get(AArch64::SEH_Nop));
+  // Insert BTI exactly at the first executable instruction.
+  const DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
+                          .addImm(HintNum)
+                          .getInstr();
+
+  // WinEH: put .seh_nop after BTI when the first real insn is FrameSetup.
+  if (HasWinCFI && MBBI != MBB.end() &&
+      MBBI->getFlag(MachineInstr::FrameSetup)) {
+    auto AfterBTI = std::next(MachineBasicBlock::iterator(BTI));
+    BuildMI(MBB, AfterBTI, DL, TII->get(AArch64::SEH_Nop));
   }
-  BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
-          TII->get(AArch64::HINT))
-      .addImm(HintNum);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 5f499e5e9700..076a6235eef0 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -369,5 +369,5 @@ def AArch64PostLegalizerCombiner
                         commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate, 
-                        extmultomull, truncsat_combines]> {
+                        extmultomull, truncsat_combines, lshr_of_trunc_of_lshr]> {
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 57dcd68595ff..79655e1c9529 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    }
    case AArch64::InOutZAUsePseudo:
    case AArch64::RequiresZASavePseudo:
+   case AArch64::SMEStateAllocPseudo:
    case AArch64::COALESCER_BARRIER_FPR16:
    case AArch64::COALESCER_BARRIER_FPR32:
    case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index c1c1f0a1024d..46f5f0c1ca9d 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -621,25 +621,30 @@ def FeatureZCRegMoveGPR64 : SubtargetFeature<"zcm-gpr64", "HasZeroCycleRegMoveGP
 def FeatureZCRegMoveGPR32 : SubtargetFeature<"zcm-gpr32", "HasZeroCycleRegMoveGPR32", "true",
                                         "Has zero-cycle register moves for GPR32 registers">;
 
+def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true",
+                                        "Has zero-cycle register moves for FPR128 registers">;
+
 def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
                                         "Has zero-cycle register moves for FPR64 registers">;
 
 def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
                                         "Has zero-cycle register moves for FPR32 registers">;
 
-def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
-                                        "Has zero-cycle zeroing instructions for generic registers">;
+def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
+                                        "Has zero-cycle zeroing instructions for GPR64 registers">;
+
+def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
+                                        "Has zero-cycle zeroing instructions for GPR32 registers">;
+
+def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
+                                        "Has zero-cycle zeroing instructions for FPR128 registers">;
 
 // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
 // as movi is more efficient across all cores. Newer cores can eliminate
 // fmovs early and there is no difference with movi, but this not true for
 // all implementations.
-def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
-                                        "Has no zero-cycle zeroing instructions for FP registers">;
-
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions",
-                                        [FeatureZCZeroingGP]>;
+def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
+                                        "Has no zero-cycle zeroing instructions for FPR64 registers">;
 
 /// ... but the floating-point version doesn't quite work in rare cases on older
 /// CPUs.
@@ -730,9 +735,13 @@ def FeatureFuseArithmeticLogic : SubtargetFeature<
     "fuse-arith-logic", "HasFuseArithmeticLogic", "true",
     "CPU fuses arithmetic and logic operations">;
 
-def FeatureFuseCCSelect : SubtargetFeature<
-    "fuse-csel", "HasFuseCCSelect", "true",
-    "CPU fuses conditional select operations">;
+def FeatureFuseCmpCSel : SubtargetFeature<
+    "fuse-csel", "HasFuseCmpCSel", "true",
+    "CPU can fuse CMP and CSEL operations">;
+
+def FeatureFuseCmpCSet : SubtargetFeature<
+    "fuse-cset", "HasFuseCmpCSet", "true",
+    "CPU can fuse CMP and CSET operations">;
 
 def FeatureFuseCryptoEOR : SubtargetFeature<
     "fuse-crypto-eor", "HasFuseCryptoEOR", "true",
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7725fa4f1ccb..175b5e04d82f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -211,6 +211,7 @@
 #include "AArch64FrameLowering.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64PrologueEpilogue.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -218,7 +219,6 @@
 #include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -293,8 +293,6 @@ static cl::opt<bool> DisableMultiVectorSpillFill(
     cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false),
     cl::Hidden);
 
-STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
-
 /// Returns how much of the incoming argument stack area (in bytes) we should
 /// clean up in an epilogue. For the C calling convention this will be 0, for
 /// guaranteed tail call conventions it can be positive (a normal return or a
@@ -328,23 +326,20 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
   return ArgumentPopSize;
 }
 
-static bool produceCompactUnwindFrame(MachineFunction &MF);
-static bool needsWinCFI(const MachineFunction &MF);
-static StackOffset getSVEStackSize(const MachineFunction &MF);
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
-                                                 bool HasCall = false);
-static bool requiresSaveVG(const MachineFunction &MF);
+static bool produceCompactUnwindFrame(const AArch64FrameLowering &,
+                                      MachineFunction &MF);
 
 // Conservatively, returns true if the function is likely to have an SVE vectors
 // on the stack. This function is safe to be called before callee-saves or
 // object offsets have been determined.
-static bool isLikelyToHaveSVEStack(const MachineFunction &MF) {
+static bool isLikelyToHaveSVEStack(const AArch64FrameLowering &AFL,
+                                   const MachineFunction &MF) {
   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   if (AFI->isSVECC())
     return true;
 
   if (AFI->hasCalculatedStackSizeSVE())
-    return bool(getSVEStackSize(MF));
+    return bool(AFL.getSVEStackSize(MF));
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) {
@@ -372,7 +367,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
     return false;
 
   // TODO: SVE is not supported yet.
-  if (isLikelyToHaveSVEStack(MF))
+  if (isLikelyToHaveSVEStack(*this, MF))
     return false;
 
   // Bail on stack adjustment needed on return for simplicity.
@@ -409,7 +404,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
 
 /// Returns true if CSRs should be paired.
 bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
-  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
+  return produceCompactUnwindFrame(*this, MF) || homogeneousPrologEpilog(MF);
 }
 
 /// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -451,11 +446,10 @@ AArch64FrameLowering::getStackIDForScalableVectors() const {
   return TargetStackID::ScalableVector;
 }
 
-/// Returns the size of the fixed object area (allocated next to sp on entry)
-/// On Win64 this may include a var args area and an UnwindHelp object for EH.
-static unsigned getFixedObjectSize(const MachineFunction &MF,
-                                   const AArch64FunctionInfo *AFI, bool IsWin64,
-                                   bool IsFunclet) {
+unsigned
+AArch64FrameLowering::getFixedObjectSize(const MachineFunction &MF,
+                                         const AArch64FunctionInfo *AFI,
+                                         bool IsWin64, bool IsFunclet) const {
   assert(AFI->getTailCallReservedStack() % 16 == 0 &&
          "Tail call reserved stack must be aligned to 16 bytes");
   if (!IsWin64 || IsFunclet) {
@@ -494,7 +488,8 @@ static unsigned getFixedObjectSize(const MachineFunction &MF,
 }
 
 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
-static StackOffset getSVEStackSize(const MachineFunction &MF) {
+StackOffset
+AArch64FrameLowering::getSVEStackSize(const MachineFunction &MF) const {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
 }
@@ -683,70 +678,6 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
   return MBB.erase(I);
 }
 
-void AArch64FrameLowering::emitCalleeSavedGPRLocations(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
-  if (CSI.empty())
-    return;
-
-  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
-  for (const auto &Info : CSI) {
-    unsigned FrameIdx = Info.getFrameIdx();
-    if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
-      continue;
-
-    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
-    int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
-    CFIBuilder.buildOffset(Info.getReg(), Offset);
-  }
-}
-
-void AArch64FrameLowering::emitCalleeSavedSVELocations(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  // Add callee saved registers to move list.
-  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
-  if (CSI.empty())
-    return;
-
-  const TargetSubtargetInfo &STI = MF.getSubtarget();
-  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
-  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
-
-  std::optional<int64_t> IncomingVGOffsetFromDefCFA;
-  if (requiresSaveVG(MF)) {
-    auto IncomingVG = *find_if(
-        reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; });
-    IncomingVGOffsetFromDefCFA =
-        MFI.getObjectOffset(IncomingVG.getFrameIdx()) - getOffsetOfLocalArea();
-  }
-
-  for (const auto &Info : CSI) {
-    if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
-      continue;
-
-    // Not all unwinders may know about SVE registers, so assume the lowest
-    // common denominator.
-    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
-    MCRegister Reg = Info.getReg();
-    if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
-      continue;
-
-    StackOffset Offset =
-        StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
-        StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
-
-    CFIBuilder.insertCFIInst(
-        createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA));
-  }
-}
-
 void AArch64FrameLowering::resetCFIToInitialState(
     MachineBasicBlock &MBB) const {
 
@@ -1088,8 +1019,8 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
   }
 }
 
-static bool windowsRequiresStackProbe(const MachineFunction &MF,
-                                      uint64_t StackSizeInBytes) {
+bool AArch64FrameLowering::windowsRequiresStackProbe(
+    const MachineFunction &MF, uint64_t StackSizeInBytes) const {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   // TODO: When implementing stack protectors, take that into account
@@ -1108,19 +1039,9 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
     LiveRegs.addReg(CSRegs[i]);
 }
 
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer.  We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
-                                                 bool HasCall) {
+Register
+AArch64FrameLowering::findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                                       bool HasCall) const {
   MachineFunction *MF = MBB->getParent();
 
   // If MBB is an entry block, use X9 as the scratch register
@@ -1193,13 +1114,14 @@ bool AArch64FrameLowering::canUseAsPrologue(
   return true;
 }
 
-static bool needsWinCFI(const MachineFunction &MF) {
+bool AArch64FrameLowering::needsWinCFI(const MachineFunction &MF) const {
   const Function &F = MF.getFunction();
   return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
          F.needsUnwindTableEntry();
 }
 
-static bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) {
+bool AArch64FrameLowering::shouldSignReturnAddressEverywhere(
+    const MachineFunction &MF) const {
   // FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR
   //        and SEH_EpilogEnd instructions in the correct order.
   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
@@ -1475,13 +1397,13 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
     ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
 }
 
-bool requiresGetVGCall(MachineFunction &MF) {
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+bool AArch64FrameLowering::requiresGetVGCall(const MachineFunction &MF) const {
+  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   return AFI->hasStreamingModeChanges() &&
          !MF.getSubtarget<AArch64Subtarget>().hasSVE();
 }
 
-static bool requiresSaveVG(const MachineFunction &MF) {
+bool AArch64FrameLowering::requiresSaveVG(const MachineFunction &MF) const {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   if (!AFI->needsDwarfUnwindInfo(MF) || !AFI->hasStreamingModeChanges())
     return false;
@@ -1499,8 +1421,8 @@ static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO,
          StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName();
 }
 
-bool isVGInstruction(MachineBasicBlock::iterator MBBI,
-                     const TargetLowering &TLI) {
+bool AArch64FrameLowering::isVGInstruction(MachineBasicBlock::iterator MBBI,
+                                           const TargetLowering &TLI) const {
   unsigned Opc = MBBI->getOpcode();
   if (Opc == AArch64::CNTD_XPiI)
     return true;
@@ -1514,15 +1436,12 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI,
   return Opc == TargetOpcode::COPY;
 }
 
-// Convert callee-save register save/restore instruction to do stack pointer
-// decrement/increment to allocate/deallocate the callee-save stack area by
-// converting store/load to use pre/post increment version.
-static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+MachineBasicBlock::iterator
+AArch64FrameLowering::convertCalleeSaveRestoreToSPPrePostIncDec(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
     bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
-    MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
-    int CFAOffset = 0) {
+    MachineInstr::MIFlag FrameFlag, int CFAOffset) const {
   unsigned NewOpc;
 
   // If the function contains streaming mode changes, we expect instructions
@@ -1643,12 +1562,9 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   return std::prev(MBB.erase(MBBI));
 }
 
-// Fixup callee-save register save/restore instructions to take into account
-// combined SP bump by adding the local stack size to the stack offsets.
-static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
-                                              uint64_t LocalStackSize,
-                                              bool NeedsWinCFI,
-                                              bool *HasWinCFI) {
+void AArch64FrameLowering::fixupCalleeSaveRestoreStackOffset(
+    MachineInstr &MI, uint64_t LocalStackSize, bool NeedsWinCFI,
+    bool *HasWinCFI) const {
   if (AArch64InstrInfo::isSEHInstruction(MI))
     return;
 
@@ -1703,7 +1619,8 @@ static unsigned getStackHazardSize(const MachineFunction &MF) {
 }
 
 // Convenience function to determine whether I is an SVE callee save.
-static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+bool AArch64FrameLowering::isSVECalleeSave(
+    MachineBasicBlock::iterator I) const {
   switch (I->getOpcode()) {
   default:
     return false;
@@ -1725,42 +1642,6 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   }
 }
 
-static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
-                                        MachineFunction &MF,
-                                        MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        const DebugLoc &DL, bool NeedsWinCFI,
-                                        bool NeedsUnwindInfo) {
-  // Shadow call stack prolog: str x30, [x18], #8
-  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
-      .addReg(AArch64::X18, RegState::Define)
-      .addReg(AArch64::LR)
-      .addReg(AArch64::X18)
-      .addImm(8)
-      .setMIFlag(MachineInstr::FrameSetup);
-
-  // This instruction also makes x18 live-in to the entry block.
-  MBB.addLiveIn(AArch64::X18);
-
-  if (NeedsWinCFI)
-    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
-        .setMIFlag(MachineInstr::FrameSetup);
-
-  if (NeedsUnwindInfo) {
-    // Emit a CFI instruction that causes 8 to be subtracted from the value of
-    // x18 when unwinding past this frame.
-    static const char CFIInst[] = {
-        dwarf::DW_CFA_val_expression,
-        18, // register
-        2,  // length
-        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
-        static_cast<char>(-8) & 0x7f, // addend (sleb128)
-    };
-    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
-        .buildEscape(StringRef(CFIInst, sizeof(CFIInst)));
-  }
-}
-
 static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
                                         MachineFunction &MF,
                                         MachineBasicBlock &MBB,
@@ -1783,36 +1664,6 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
         .buildRestore(AArch64::X18);
 }
 
-// Define the current CFA rule to use the provided FP.
-static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MBBI,
-                                unsigned FixedObject) {
-  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
-  const AArch64RegisterInfo *TRI = STI.getRegisterInfo();
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-
-  const int OffsetToFirstCalleeSaveFromFP =
-      AFI->getCalleeSaveBaseToFrameRecordOffset() -
-      AFI->getCalleeSavedStackSize();
-  Register FramePtr = TRI->getFrameRegister(MF);
-  CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
-      .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP);
-}
-
-#ifndef NDEBUG
-/// Collect live registers from the end of \p MI's parent up to (including) \p
-/// MI in \p LiveRegs.
-static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
-                                LivePhysRegs &LiveRegs) {
-
-  MachineBasicBlock &MBB = *MI.getParent();
-  LiveRegs.addLiveOuts(MBB);
-  for (const MachineInstr &MI :
-       reverse(make_range(MI.getIterator(), MBB.instr_end())))
-    LiveRegs.stepBackward(MI);
-}
-#endif
-
 void AArch64FrameLowering::emitPacRetPlusLeafHardening(
     MachineFunction &MF) const {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
@@ -1848,616 +1699,8 @@ void AArch64FrameLowering::emitPacRetPlusLeafHardening(
 
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  const Function &F = MF.getFunction();
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
-  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
-
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
-  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
-  bool HasFP = hasFP(MF);
-  bool NeedsWinCFI = needsWinCFI(MF);
-  bool HasWinCFI = false;
-  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
-
-  MachineBasicBlock::iterator End = MBB.end();
-#ifndef NDEBUG
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  // Collect live register from the end of MBB up to the start of the existing
-  // frame setup instructions.
-  MachineBasicBlock::iterator NonFrameStart = MBB.begin();
-  while (NonFrameStart != End &&
-         NonFrameStart->getFlag(MachineInstr::FrameSetup))
-    ++NonFrameStart;
-
-  LivePhysRegs LiveRegs(*TRI);
-  if (NonFrameStart != MBB.end()) {
-    getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
-    // Ignore registers used for stack management for now.
-    LiveRegs.removeReg(AArch64::SP);
-    LiveRegs.removeReg(AArch64::X19);
-    LiveRegs.removeReg(AArch64::FP);
-    LiveRegs.removeReg(AArch64::LR);
-
-    // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
-    // This is necessary to spill VG if required where SVE is unavailable, but
-    // X0 is preserved around this call.
-    if (requiresGetVGCall(MF))
-      LiveRegs.removeReg(AArch64::X0);
-  }
-
-  auto VerifyClobberOnExit = make_scope_exit([&]() {
-    if (NonFrameStart == MBB.end())
-      return;
-    // Check if any of the newly instructions clobber any of the live registers.
-    for (MachineInstr &MI :
-         make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
-      for (auto &Op : MI.operands())
-        if (Op.isReg() && Op.isDef())
-          assert(!LiveRegs.contains(Op.getReg()) &&
-                 "live register clobbered by inserted prologue instructions");
-    }
-  });
-#endif
-
-  bool IsFunclet = MBB.isEHFuncletEntry();
-
-  // At this point, we're going to decide whether or not the function uses a
-  // redzone. In most cases, the function doesn't have a redzone so let's
-  // assume that's false and set it to true in the case that there's a redzone.
-  AFI->setHasRedZone(false);
-
-  // Debug location must be unknown since the first debug location is used
-  // to determine the end of the prologue.
-  DebugLoc DL;
-
-  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
-  if (MFnI.shouldSignReturnAddress(MF)) {
-    // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions
-    // are inserted by emitPacRetPlusLeafHardening().
-    if (!shouldSignReturnAddressEverywhere(MF)) {
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-    // AArch64PointerAuth pass will insert SEH_PACSignLR
-    HasWinCFI |= NeedsWinCFI;
-  }
-
-  if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) {
-    emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
-                                MFnI.needsDwarfUnwindInfo(MF));
-    HasWinCFI |= NeedsWinCFI;
-  }
-
-  if (EmitCFI && MFnI.isMTETagged()) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-
-  // We signal the presence of a Swift extended frame to external tools by
-  // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
-  // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
-  // bits so that is still true.
-  if (HasFP && AFI->hasSwiftAsyncContext()) {
-    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
-    case SwiftAsyncFramePointerMode::DeploymentBased:
-      if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
-        // The special symbol below is absolute and has a *value* that can be
-        // combined with the frame pointer to signal an extended frame.
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
-            .addExternalSymbol("swift_async_extendedFramePointerFlags",
-                               AArch64II::MO_GOT);
-        if (NeedsWinCFI) {
-          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-              .setMIFlags(MachineInstr::FrameSetup);
-          HasWinCFI = true;
-        }
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
-            .addUse(AArch64::FP)
-            .addUse(AArch64::X16)
-            .addImm(Subtarget.isTargetILP32() ? 32 : 0);
-        if (NeedsWinCFI) {
-          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-              .setMIFlags(MachineInstr::FrameSetup);
-          HasWinCFI = true;
-        }
-        break;
-      }
-      [[fallthrough]];
-
-    case SwiftAsyncFramePointerMode::Always:
-      // ORR x29, x29, #0x1000_0000_0000_0000
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
-          .addUse(AArch64::FP)
-          .addImm(0x1100)
-          .setMIFlag(MachineInstr::FrameSetup);
-      if (NeedsWinCFI) {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlags(MachineInstr::FrameSetup);
-        HasWinCFI = true;
-      }
-      break;
-
-    case SwiftAsyncFramePointerMode::Never:
-      break;
-    }
-  }
-
-  // All calls are tail calls in GHC calling conv, and functions have no
-  // prologue/epilogue.
-  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
-    return;
-
-  // Set tagged base pointer to the requested stack slot.
-  // Ideally it should match SP value after prologue.
-  std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
-  if (TBPI)
-    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
-  else
-    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
-
-  const StackOffset &SVEStackSize = getSVEStackSize(MF);
-
-  // getStackSize() includes all the locals in its size calculation. We don't
-  // include these locals when computing the stack size of a funclet, as they
-  // are allocated in the parent's stack frame and accessed via the frame
-  // pointer from the funclet.  We only save the callee saved registers in the
-  // funclet, which are really the callee saved registers of the parent
-  // function, including the funclet.
-  int64_t NumBytes =
-      IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
-  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
-    assert(!HasFP && "unexpected function without stack frame but with FP");
-    assert(!SVEStackSize &&
-           "unexpected function without stack frame but with SVE objects");
-    // All of the stack allocation is for locals.
-    AFI->setLocalStackSize(NumBytes);
-    if (!NumBytes) {
-      if (NeedsWinCFI && HasWinCFI) {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-      return;
-    }
-    // REDZONE: If the stack size is less than 128 bytes, we don't need
-    // to actually allocate.
-    if (canUseRedZone(MF)) {
-      AFI->setHasRedZone(true);
-      ++NumRedZoneFunctions;
-    } else {
-      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-                      StackOffset::getFixed(-NumBytes), TII,
-                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
-      if (EmitCFI) {
-        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-        MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
-        // Encode the stack size of the leaf function.
-        CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
-            .buildDefCFAOffset(NumBytes, FrameLabel);
-      }
-    }
-
-    if (NeedsWinCFI) {
-      HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-
-    return;
-  }
-
-  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
-  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
-
-  // Windows unwind can't represent the required stack adjustments if we have
-  // both SVE callee-saves and dynamic stack allocations, and the frame
-  // pointer is before the SVE spills.  The allocation of the frame pointer
-  // must be the last instruction in the prologue so the unwinder can restore
-  // the stack pointer correctly. (And there isn't any unwind opcode for
-  // `addvl sp, x29, -17`.)
-  //
-  // Because of this, we do spills in the opposite order on Windows: first SVE,
-  // then GPRs. The main side-effect of this is that it makes accessing
-  // parameters passed on the stack more expensive.
-  //
-  // We could consider rearranging the spills for simpler cases.
-  bool FPAfterSVECalleeSaves =
-      Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
-
-  if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
-    reportFatalUsageError("SME hazard padding is not supported on Windows");
-
-  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
-  // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
-  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
-  if (FPAfterSVECalleeSaves) {
-    // If we're doing SVE saves first, we need to immediately allocate space
-    // for fixed objects, then space for the SVE callee saves.
-    //
-    // Windows unwind requires that the scalable size is a multiple of 16;
-    // that's handled when the callee-saved size is computed.
-    auto SaveSize =
-        StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
-        StackOffset::getFixed(FixedObject);
-    allocateStackSpace(MBB, MBBI, 0, SaveSize, NeedsWinCFI, &HasWinCFI,
-                       /*EmitCFI=*/false, StackOffset{},
-                       /*FollowupAllocs=*/true);
-    NumBytes -= FixedObject;
-
-    // Now allocate space for the GPR callee saves.
-    while (MBBI != End && IsSVECalleeSave(MBBI))
-      ++MBBI;
-    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
-        MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
-        &HasWinCFI, EmitAsyncCFI);
-    NumBytes -= AFI->getCalleeSavedStackSize();
-  } else if (CombineSPBump) {
-    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
-    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-                    StackOffset::getFixed(-NumBytes), TII,
-                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
-                    EmitAsyncCFI);
-    NumBytes = 0;
-  } else if (HomPrologEpilog) {
-    // Stack has been already adjusted.
-    NumBytes -= PrologueSaveSize;
-  } else if (PrologueSaveSize != 0) {
-    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
-        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
-        EmitAsyncCFI);
-    NumBytes -= PrologueSaveSize;
-  }
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
-
-  // Move past the saves of the callee-saved registers, fixing up the offsets
-  // and pre-inc if we decided to combine the callee-save and local stack
-  // pointer bump above.
-  auto &TLI = *MF.getSubtarget().getTargetLowering();
-  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
-         !IsSVECalleeSave(MBBI)) {
-    if (CombineSPBump &&
-        // Only fix-up frame-setup load/store instructions.
-        (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI)))
-      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
-                                        NeedsWinCFI, &HasWinCFI);
-    ++MBBI;
-  }
-
-  // For funclets the FP belongs to the containing function.
-  if (!IsFunclet && HasFP) {
-    // Only set up FP if we actually need to.
-    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
-
-    if (CombineSPBump)
-      FPOffset += AFI->getLocalStackSize();
-
-    if (AFI->hasSwiftAsyncContext()) {
-      // Before we update the live FP we have to ensure there's a valid (or
-      // null) asynchronous context in its slot just before FP in the frame
-      // record, so store it now.
-      const auto &Attrs = MF.getFunction().getAttributes();
-      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
-      if (HaveInitialContext)
-        MBB.addLiveIn(AArch64::X22);
-      Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
-          .addUse(Reg)
-          .addUse(AArch64::SP)
-          .addImm(FPOffset - 8)
-          .setMIFlags(MachineInstr::FrameSetup);
-      if (NeedsWinCFI) {
-        // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
-        // to multiple instructions, should be mutually-exclusive.
-        assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlags(MachineInstr::FrameSetup);
-        HasWinCFI = true;
-      }
-    }
-
-    if (HomPrologEpilog) {
-      auto Prolog = MBBI;
-      --Prolog;
-      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
-      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
-    } else {
-      // Issue    sub fp, sp, FPOffset or
-      //          mov fp,sp          when FPOffset is zero.
-      // Note: All stores of callee-saved registers are marked as "FrameSetup".
-      // This code marks the instruction(s) that set the FP also.
-      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
-                      StackOffset::getFixed(FPOffset), TII,
-                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
-      if (NeedsWinCFI && HasWinCFI) {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
-            .setMIFlag(MachineInstr::FrameSetup);
-        // After setting up the FP, the rest of the prolog doesn't need to be
-        // included in the SEH unwind info.
-        NeedsWinCFI = false;
-      }
-    }
-    if (EmitAsyncCFI)
-      emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
-  }
-
-  // Now emit the moves for whatever callee saved regs we have (including FP,
-  // LR if those are saved). Frame instructions for SVE register are emitted
-  // later, after the instruction which actually save SVE regs.
-  if (EmitAsyncCFI)
-    emitCalleeSavedGPRLocations(MBB, MBBI);
-
-  // Alignment is required for the parent frame, not the funclet
-  const bool NeedsRealignment =
-      NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-  const int64_t RealignmentPadding =
-      (NeedsRealignment && MFI.getMaxAlign() > Align(16))
-          ? MFI.getMaxAlign().value() - 16
-          : 0;
-
-  if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
-    if (AFI->getSVECalleeSavedStackSize())
-      report_fatal_error(
-          "SVE callee saves not yet supported with stack probing");
-
-    // Find an available register to spill the value of X15 to, if X15 is being
-    // used already for nest.
-    unsigned X15Scratch = AArch64::NoRegister;
-    const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
-    if (llvm::any_of(MBB.liveins(),
-                     [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
-                       return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
-                           AArch64::X15, LiveIn.PhysReg);
-                     })) {
-      X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
-      assert(X15Scratch != AArch64::NoRegister &&
-             (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
-#ifndef NDEBUG
-      LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
-#endif
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
-          .addReg(AArch64::XZR)
-          .addReg(AArch64::X15, RegState::Undef)
-          .addReg(AArch64::X15, RegState::Implicit)
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-
-    uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
-    if (NeedsWinCFI) {
-      HasWinCFI = true;
-      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
-      // exceed this amount.  We need to move at most 2^24 - 1 into x15.
-      // This is at most two instructions, MOVZ followed by MOVK.
-      // TODO: Fix to use multiple stack alloc unwind codes for stacks
-      // exceeding 256MB in size.
-      if (NumBytes >= (1 << 28))
-        report_fatal_error("Stack size cannot exceed 256MB for stack "
-                           "unwinding purposes");
-
-      uint32_t LowNumWords = NumWords & 0xFFFF;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
-          .addImm(LowNumWords)
-          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-          .setMIFlag(MachineInstr::FrameSetup);
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-          .setMIFlag(MachineInstr::FrameSetup);
-      if ((NumWords & 0xFFFF0000) != 0) {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
-            .addReg(AArch64::X15)
-            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
-            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
-            .setMIFlag(MachineInstr::FrameSetup);
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-    } else {
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
-          .addImm(NumWords)
-          .setMIFlags(MachineInstr::FrameSetup);
-    }
-
-    const char *ChkStk = Subtarget.getChkStkName();
-    switch (MF.getTarget().getCodeModel()) {
-    case CodeModel::Tiny:
-    case CodeModel::Small:
-    case CodeModel::Medium:
-    case CodeModel::Kernel:
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
-          .addExternalSymbol(ChkStk)
-          .addReg(AArch64::X15, RegState::Implicit)
-          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
-          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
-          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
-          .setMIFlags(MachineInstr::FrameSetup);
-      if (NeedsWinCFI) {
-        HasWinCFI = true;
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-      break;
-    case CodeModel::Large:
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
-          .addReg(AArch64::X16, RegState::Define)
-          .addExternalSymbol(ChkStk)
-          .addExternalSymbol(ChkStk)
-          .setMIFlags(MachineInstr::FrameSetup);
-      if (NeedsWinCFI) {
-        HasWinCFI = true;
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-
-      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
-          .addReg(AArch64::X16, RegState::Kill)
-          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
-          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
-          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
-          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
-          .setMIFlags(MachineInstr::FrameSetup);
-      if (NeedsWinCFI) {
-        HasWinCFI = true;
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-      break;
-    }
-
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
-        .addReg(AArch64::SP, RegState::Kill)
-        .addReg(AArch64::X15, RegState::Kill)
-        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
-        .setMIFlags(MachineInstr::FrameSetup);
-    if (NeedsWinCFI) {
-      HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
-          .addImm(NumBytes)
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-    NumBytes = 0;
-
-    if (RealignmentPadding > 0) {
-      if (RealignmentPadding >= 4096) {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
-            .addReg(AArch64::X16, RegState::Define)
-            .addImm(RealignmentPadding)
-            .setMIFlags(MachineInstr::FrameSetup);
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
-            .addReg(AArch64::SP)
-            .addReg(AArch64::X16, RegState::Kill)
-            .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
-            .setMIFlag(MachineInstr::FrameSetup);
-      } else {
-        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
-            .addReg(AArch64::SP)
-            .addImm(RealignmentPadding)
-            .addImm(0)
-            .setMIFlag(MachineInstr::FrameSetup);
-      }
-
-      uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
-          .addReg(AArch64::X15, RegState::Kill)
-          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
-      AFI->setStackRealigned(true);
-
-      // No need for SEH instructions here; if we're realigning the stack,
-      // we've set a frame pointer and already finished the SEH prologue.
-      assert(!NeedsWinCFI);
-    }
-    if (X15Scratch != AArch64::NoRegister) {
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
-          .addReg(AArch64::XZR)
-          .addReg(X15Scratch, RegState::Undef)
-          .addReg(X15Scratch, RegState::Implicit)
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-  }
-
-  StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
-  MachineBasicBlock::iterator CalleeSavesEnd = MBBI;
-
-  StackOffset CFAOffset =
-      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-
-  // Process the SVE callee-saves to determine what space needs to be
-  // allocated.
-  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
-    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
-                      << "\n");
-    SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
-    SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
-    // Find callee save instructions in frame.
-    // Note: With FPAfterSVECalleeSaves the callee saves have already been
-    // allocated.
-    if (!FPAfterSVECalleeSaves) {
-      MachineBasicBlock::iterator CalleeSavesBegin = MBBI;
-      assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
-      while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
-        ++MBBI;
-      CalleeSavesEnd = MBBI;
-
-      StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
-      // Allocate space for the callee saves (if any).
-      allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
-                         nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
-                         MFI.hasVarSizedObjects() || LocalsSize);
-    }
-  }
-  CFAOffset += SVECalleeSavesSize;
-
-  if (EmitAsyncCFI)
-    emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
-
-  // Allocate space for the rest of the frame including SVE locals. Align the
-  // stack as necessary.
-  assert(!(canUseRedZone(MF) && NeedsRealignment) &&
-         "Cannot use redzone with stack realignment");
-  if (!canUseRedZone(MF)) {
-    // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
-    // the correct value here, as NumBytes also includes padding bytes,
-    // which shouldn't be counted here.
-    allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
-                       SVELocalsSize + StackOffset::getFixed(NumBytes),
-                       NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
-                       CFAOffset, MFI.hasVarSizedObjects());
-  }
-
-  // If we need a base pointer, set it up here. It's whatever the value of the
-  // stack pointer is at this point. Any variable size objects will be allocated
-  // after this, so we can still use the base pointer to reference locals.
-  //
-  // FIXME: Clarify FrameSetup flags here.
-  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
-  // needed.
-  // For funclets the BP belongs to the containing function.
-  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
-    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
-                     false);
-    if (NeedsWinCFI) {
-      HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-  }
-
-  // The very last FrameSetup instruction indicates the end of prologue. Emit a
-  // SEH opcode indicating the prologue end.
-  if (NeedsWinCFI && HasWinCFI) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-
-  // SEH funclets are passed the frame pointer in X1.  If the parent
-  // function uses the base register, then the base register is used
-  // directly, and is not retrieved from X1.
-  if (IsFunclet && F.hasPersonalityFn()) {
-    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
-    if (isAsynchronousEHPersonality(Per)) {
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
-          .addReg(AArch64::X1)
-          .setMIFlag(MachineInstr::FrameSetup);
-      MBB.addLiveIn(AArch64::X1);
-    }
-  }
-
-  if (EmitCFI && !EmitAsyncCFI) {
-    if (HasFP) {
-      emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
-    } else {
-      StackOffset TotalSize =
-          SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
-      CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
-      CFIBuilder.insertCFIInst(
-          createDefCFA(*RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
-                       TotalSize, /*LastAdjustmentWasScalable=*/false));
-    }
-    emitCalleeSavedGPRLocations(MBB, MBBI);
-    emitCalleeSavedSVELocations(MBB, MBBI);
-  }
+  AArch64PrologueEmitter PrologueEmitter(MF, MBB, *this);
+  PrologueEmitter.emitPrologue();
 }
 
 static bool isFuncletReturnInstr(const MachineInstr &MI) {
@@ -2548,15 +1791,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   if (homogeneousPrologEpilog(MF, &MBB)) {
     assert(!NeedsWinCFI);
-    auto LastPopI = MBB.getFirstTerminator();
-    if (LastPopI != MBB.begin()) {
-      auto HomogeneousEpilog = std::prev(LastPopI);
+    auto FirstHomogenousEpilogI = MBB.getFirstTerminator();
+    if (FirstHomogenousEpilogI != MBB.begin()) {
+      auto HomogeneousEpilog = std::prev(FirstHomogenousEpilogI);
       if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
-        LastPopI = HomogeneousEpilog;
+        FirstHomogenousEpilogI = HomogeneousEpilog;
     }
 
     // Adjust local stack
-    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+    emitFrameOffset(MBB, FirstHomogenousEpilogI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(AFI->getLocalStackSize()), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
 
@@ -2602,17 +1845,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // Move past the restores of the callee-saved registers.
   // If we plan on combining the sp bump of the local stack size and the callee
   // save stack size, we might need to adjust the CSR save and restore offsets.
-  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator FirstGPRRestoreI = MBB.getFirstTerminator();
   MachineBasicBlock::iterator Begin = MBB.begin();
-  while (LastPopI != Begin) {
-    --LastPopI;
-    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
-        (!FPAfterSVECalleeSaves && IsSVECalleeSave(LastPopI))) {
-      ++LastPopI;
+  while (FirstGPRRestoreI != Begin) {
+    --FirstGPRRestoreI;
+    if (!FirstGPRRestoreI->getFlag(MachineInstr::FrameDestroy) ||
+        (!FPAfterSVECalleeSaves && isSVECalleeSave(FirstGPRRestoreI))) {
+      ++FirstGPRRestoreI;
       break;
     } else if (CombineSPBump)
-      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
-                                        NeedsWinCFI, &HasWinCFI);
+      fixupCalleeSaveRestoreStackOffset(
+          *FirstGPRRestoreI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI);
   }
 
   if (NeedsWinCFI) {
@@ -2622,9 +1865,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     // arguments. Insert the SEH_EpilogStart and remove it later if it
     // we didn't emit any SEH opcodes to avoid generating WinCFI for
     // functions that don't need it.
-    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
+    BuildMI(MBB, FirstGPRRestoreI, DL, TII->get(AArch64::SEH_EpilogStart))
         .setMIFlag(MachineInstr::FrameDestroy);
-    EpilogStartI = LastPopI;
+    EpilogStartI = FirstGPRRestoreI;
     --EpilogStartI;
   }
 
@@ -2665,7 +1908,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
     // When we are about to restore the CSRs, the CFA register is SP again.
     if (EmitCFI && hasFP(MF))
-      CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
+      CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy)
           .buildDefCFA(AArch64::SP, NumBytes);
 
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
@@ -2681,18 +1924,19 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // Process the SVE callee-saves to determine what space needs to be
   // deallocated.
   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
-  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+  MachineBasicBlock::iterator RestoreBegin = FirstGPRRestoreI,
+                              RestoreEnd = FirstGPRRestoreI;
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
     if (FPAfterSVECalleeSaves)
       RestoreEnd = MBB.getFirstTerminator();
 
     RestoreBegin = std::prev(RestoreEnd);
     while (RestoreBegin != MBB.begin() &&
-           IsSVECalleeSave(std::prev(RestoreBegin)))
+           isSVECalleeSave(std::prev(RestoreBegin)))
       --RestoreBegin;
 
-    assert(IsSVECalleeSave(RestoreBegin) &&
-           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+    assert(isSVECalleeSave(RestoreBegin) &&
+           isSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
 
     StackOffset CalleeSavedSizeAsOffset =
         StackOffset::getScalable(CalleeSavedSize);
@@ -2706,7 +1950,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     // deallocates non-callee-save SVE allocations.  Otherwise, deallocate
     // them explicitly.
     if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
-      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+      emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
                       DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
                       NeedsWinCFI, &HasWinCFI);
     }
@@ -2796,7 +2040,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       StackRestoreBytes += AfterCSRPopSize;
 
     emitFrameOffset(
-        MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+        MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
         StackOffset::getFixed(StackRestoreBytes), TII,
         MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
         StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
@@ -2816,17 +2060,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // be able to save any instructions.
   if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
     emitFrameOffset(
-        MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+        MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::FP,
         StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
         TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
   } else if (NumBytes)
-    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+    emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(NumBytes), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
 
   // When we are about to restore the CSRs, the CFA register is SP again.
   if (EmitCFI && hasFP(MF))
-    CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
+    CFIInstBuilder(MBB, FirstGPRRestoreI, MachineInstr::FrameDestroy)
         .buildDefCFA(AArch64::SP, PrologueSaveSize);
 
   // This must be placed after the callee-save restore code because that code
@@ -2926,8 +2170,8 @@ AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
   return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
 }
 
-static StackOffset getFPOffset(const MachineFunction &MF,
-                               int64_t ObjectOffset) {
+StackOffset AArch64FrameLowering::getFPOffset(const MachineFunction &MF,
+                                              int64_t ObjectOffset) const {
   const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const Function &F = MF.getFunction();
@@ -2940,8 +2184,8 @@ static StackOffset getFPOffset(const MachineFunction &MF,
   return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
 }
 
-static StackOffset getStackOffset(const MachineFunction &MF,
-                                  int64_t ObjectOffset) {
+StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF,
+                                                 int64_t ObjectOffset) const {
   const auto &MFI = MF.getFrameInfo();
   return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
 }
@@ -3139,7 +2383,8 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
   return getKillRegState(!IsLiveIn);
 }
 
-static bool produceCompactUnwindFrame(MachineFunction &MF) {
+static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL,
+                                      MachineFunction &MF) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   AttributeList Attrs = MF.getFunction().getAttributes();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -3147,7 +2392,7 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
          !(Subtarget.getTargetLowering()->supportSwiftError() &&
            Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
          MF.getFunction().getCallingConv() != CallingConv::SwiftTail &&
-         !requiresSaveVG(MF) && !AFI->isSVECC();
+         !AFL.requiresSaveVG(MF) && !AFI->isSVECC();
 }
 
 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
@@ -3244,16 +2489,18 @@ bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
           (!IsLocallyStreaming && Subtarget.isStreaming()));
 }
 
-static void computeCalleeSaveRegisterPairs(
-    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
-    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
-    bool NeedsFrameRecord) {
+void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL,
+                                    MachineFunction &MF,
+                                    ArrayRef<CalleeSavedInfo> CSI,
+                                    const TargetRegisterInfo *TRI,
+                                    SmallVectorImpl<RegPairInfo> &RegPairs,
+                                    bool NeedsFrameRecord) {
 
   if (CSI.empty())
     return;
 
   bool IsWindows = isTargetWindows(MF);
-  bool NeedsWinCFI = needsWinCFI(MF);
+  bool NeedsWinCFI = AFL.needsWinCFI(MF);
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned StackHazardSize = getStackHazardSize(MF);
   MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3262,9 +2509,10 @@ static void computeCalleeSaveRegisterPairs(
   (void)CC;
   // MachO's compact unwind format relies on all registers being stored in
   // pairs.
-  assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
-          CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
-          CC == CallingConv::Win64 || (Count & 1) == 0) &&
+  assert((!produceCompactUnwindFrame(AFL, MF) ||
+          CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll ||
+          CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 ||
+          (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
   int ByteOffset = AFI->getCalleeSavedStackSize();
   int StackFillDir = -1;
@@ -3380,9 +2628,9 @@ static void computeCalleeSaveRegisterPairs(
 
     // MachO's compact unwind format relies on all registers being stored in
     // adjacent register pairs.
-    assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
-            CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
-            CC == CallingConv::Win64 ||
+    assert((!produceCompactUnwindFrame(AFL, MF) ||
+            CC == CallingConv::PreserveMost || CC == CallingConv::PreserveAll ||
+            CC == CallingConv::CXX_FAST_TLS || CC == CallingConv::Win64 ||
             (RPI.isPaired() &&
              ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
               RPI.Reg1 + 1 == RPI.Reg2))) &&
@@ -3495,7 +2743,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
-  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+  computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF));
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
   // Refresh the reserved regs in case there are any potential changes since the
@@ -3707,7 +2955,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   if (MBBI != MBB.end())
     DL = MBBI->getDebugLoc();
 
-  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
+  computeCalleeSaveRegisterPairs(*this, MF, CSI, TRI, RegPairs, hasFP(MF));
   if (homogeneousPrologEpilog(MF, &MBB)) {
     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                    .setMIFlag(MachineInstr::FrameDestroy);
@@ -4141,7 +3389,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       if (producePairRegisters(MF)) {
         if (UnspilledCSGPRPaired == AArch64::NoRegister) {
           // Failed to make a pair for compact unwind format, revert spilling.
-          if (produceCompactUnwindFrame(MF)) {
+          if (produceCompactUnwindFrame(*this, MF)) {
             SavedRegs.reset(UnspilledCSGPR);
             ExtraCSSpill = AArch64::NoRegister;
           }
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 555a93359c27..a9d65441a4e3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -19,6 +19,10 @@
 
 namespace llvm {
 
+class TargetLowering;
+class AArch64FunctionInfo;
+class AArch64PrologueEmitter;
+
 class AArch64FrameLowering : public TargetFrameLowering {
 public:
   explicit AArch64FrameLowering()
@@ -130,12 +134,19 @@ public:
     return StackId != TargetStackID::ScalableVector;
   }
 
+  friend class AArch64PrologueEmitter;
   void
   orderFrameObjects(const MachineFunction &MF,
                     SmallVectorImpl<int> &ObjectsToAllocate) const override;
 
   bool isFPReserved(const MachineFunction &MF) const;
 
+  bool needsWinCFI(const MachineFunction &MF) const;
+
+  bool requiresSaveVG(const MachineFunction &MF) const;
+
+  StackOffset getSVEStackSize(const MachineFunction &MF) const;
+
 protected:
   bool hasFPImpl(const MachineFunction &MF) const override;
 
@@ -159,10 +170,6 @@ private:
                                       int &MaxCSFrameIndex) const;
   bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
                                                 uint64_t StackBumpBytes) const;
-  void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI) const;
-  void emitCalleeSavedSVELocations(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI) const;
   void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI) const;
   void emitCalleeSavedSVERestores(MachineBasicBlock &MBB,
@@ -196,6 +203,61 @@ private:
 
   void emitRemarks(const MachineFunction &MF,
                    MachineOptimizationRemarkEmitter *ORE) const override;
+
+  bool windowsRequiresStackProbe(const MachineFunction &MF,
+                                 uint64_t StackSizeInBytes) const;
+
+  bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) const;
+
+  StackOffset getFPOffset(const MachineFunction &MF,
+                          int64_t ObjectOffset) const;
+
+  StackOffset getStackOffset(const MachineFunction &MF,
+                             int64_t ObjectOffset) const;
+
+  // Find a scratch register that we can use at the start of the prologue to
+  // re-align the stack pointer.  We avoid using callee-save registers since
+  // they may appear to be free when this is called from canUseAsPrologue
+  // (during shrink wrapping), but then no longer be free when this is called
+  // from emitPrologue.
+  //
+  // FIXME: This is a bit conservative, since in the above case we could use one
+  // of the callee-save registers as a scratch temp to re-align the stack
+  // pointer, but we would then have to make sure that we were in fact saving at
+  // least one callee-save register in the prologue, which is additional
+  // complexity that doesn't seem worth the benefit.
+  Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                            bool HasCall = false) const;
+
+  // Convert callee-save register save/restore instruction to do stack pointer
+  // decrement/increment to allocate/deallocate the callee-save stack area by
+  // converting store/load to use pre/post increment version.
+  MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+      const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
+      bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
+      MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
+      int CFAOffset = 0) const;
+
+  // Fixup callee-save register save/restore instructions to take into account
+  // combined SP bump by adding the local stack size to the stack offsets.
+  void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+                                         uint64_t LocalStackSize,
+                                         bool NeedsWinCFI,
+                                         bool *HasWinCFI) const;
+
+  bool isSVECalleeSave(MachineBasicBlock::iterator I) const;
+
+  /// Returns the size of the fixed object area (allocated next to sp on entry).
+  /// On Win64 this may include a var args area and an UnwindHelp object for EH.
+  unsigned getFixedObjectSize(const MachineFunction &MF,
+                              const AArch64FunctionInfo *AFI, bool IsWin64,
+                              bool IsFunclet) const;
+
+  bool isVGInstruction(MachineBasicBlock::iterator MBBI,
+                       const TargetLowering &TLI) const;
+
+  bool requiresGetVGCall(const MachineFunction &MF) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index bc786f415b55..6fdc981fc21a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -246,9 +246,9 @@ public:
     return false;
   }
 
-  template<MVT::SimpleValueType VT>
+  template <MVT::SimpleValueType VT, bool Negate>
   bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
-    return SelectSVEAddSubImm(N, VT, Imm, Shift);
+    return SelectSVEAddSubImm(N, VT, Imm, Shift, Negate);
   }
 
   template <MVT::SimpleValueType VT, bool Negate>
@@ -489,7 +489,8 @@ private:
 
   bool SelectCMP_SWAP(SDNode *N);
 
-  bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
+  bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift,
+                          bool Negate);
   bool SelectSVEAddSubSSatImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift,
                               bool Negate);
   bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
@@ -4227,35 +4228,36 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
 }
 
 bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm,
-                                             SDValue &Shift) {
+                                             SDValue &Shift, bool Negate) {
   if (!isa<ConstantSDNode>(N))
     return false;
 
   SDLoc DL(N);
-  uint64_t Val = cast<ConstantSDNode>(N)
-                     ->getAPIntValue()
-                     .trunc(VT.getFixedSizeInBits())
-                     .getZExtValue();
+  APInt Val =
+      cast<ConstantSDNode>(N)->getAPIntValue().trunc(VT.getFixedSizeInBits());
+
+  if (Negate)
+    Val = -Val;
 
   switch (VT.SimpleTy) {
   case MVT::i8:
     // All immediates are supported.
     Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-    Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
+    Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32);
     return true;
   case MVT::i16:
   case MVT::i32:
   case MVT::i64:
     // Support 8bit unsigned immediates.
-    if (Val <= 255) {
+    if ((Val & ~0xff) == 0) {
       Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
+      Imm = CurDAG->getTargetConstant(Val.getZExtValue(), DL, MVT::i32);
       return true;
     }
     // Support 16bit unsigned immediates that are a multiple of 256.
-    if (Val <= 65280 && Val % 256 == 0) {
+    if ((Val & ~0xff00) == 0) {
       Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32);
+      Imm = CurDAG->getTargetConstant(Val.lshr(8).getZExtValue(), DL, MVT::i32);
       return true;
     }
     break;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d70a46b0e893..5ffaf2c49b4c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
 
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
@@ -1918,6 +1919,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
+  // Handle non-aliasing elements mask
+  if (Subtarget->hasSVE2() ||
+      (Subtarget->hasSME() && Subtarget->isStreaming())) {
+    // FIXME: Support wider fixed-length types when msve-vector-bits is used.
+    for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
+      setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+      setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+    }
+    for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
+      setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+      setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+    }
+  }
+
   // Handle operations that are only available in non-streaming SVE mode.
   if (Subtarget->isSVEAvailable()) {
     for (auto VT : {MVT::nxv16i8,  MVT::nxv8i16, MVT::nxv4i32,  MVT::nxv2i64,
@@ -2585,6 +2600,30 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
     Known = Known.intersectWith(Known2);
     break;
   }
+  case AArch64ISD::CSNEG:
+  case AArch64ISD::CSINC:
+  case AArch64ISD::CSINV: {
+    KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+    KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+
+    // The result is either:
+    // CSINC: KnownOp0 or KnownOp1 + 1
+    // CSINV: KnownOp0 or ~KnownOp1
+    // CSNEG: KnownOp0 or KnownOp1 * -1
+    if (Op.getOpcode() == AArch64ISD::CSINC)
+      KnownOp1 = KnownBits::add(
+          KnownOp1,
+          KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
+    else if (Op.getOpcode() == AArch64ISD::CSINV)
+      std::swap(KnownOp1.Zero, KnownOp1.One);
+    else if (Op.getOpcode() == AArch64ISD::CSNEG)
+      KnownOp1 =
+          KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes(
+                                       Op.getScalarValueSizeInBits())));
+
+    Known = KnownOp0.intersectWith(KnownOp1);
+    break;
+  }
   case AArch64ISD::BICi: {
     // Compute the bit cleared value.
     APInt Mask =
@@ -2626,6 +2665,32 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
                                        << Op->getConstantOperandVal(1)));
     break;
   }
+  case AArch64ISD::MOVImsl: {
+    unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+    Known = KnownBits::makeConstant(APInt(
+        Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
+    break;
+  }
+  case AArch64ISD::MOVIedit: {
+    Known = KnownBits::makeConstant(APInt(
+        Known.getBitWidth(),
+        AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
+    break;
+  }
+  case AArch64ISD::MVNIshift: {
+    Known = KnownBits::makeConstant(
+        APInt(Known.getBitWidth(),
+              ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
+              /*isSigned*/ false, /*implicitTrunc*/ true));
+    break;
+  }
+  case AArch64ISD::MVNImsl: {
+    unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+    Known = KnownBits::makeConstant(
+        APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
+              /*isSigned*/ false, /*implicitTrunc*/ true));
+    break;
+  }
   case AArch64ISD::LOADgot:
   case AArch64ISD::ADDlow: {
     if (!Subtarget->isTargetILP32())
@@ -2984,21 +3049,20 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
   TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
   if (TPIDR2.Uses > 0) {
+    // Note: This case just needs to do `SVL << 48`. It is not implemented as we
+    // generally don't support big-endian SVE/SME.
+    if (!Subtarget->isLittleEndian())
+      reportFatalInternalError(
+          "TPIDR2 block initialization is not supported on big-endian targets");
+
     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
-    // Store the buffer pointer to the TPIDR2 stack object.
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+    // Store buffer pointer and num_za_save_slices.
+    // Bytes 10-15 are implicitly zeroed.
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
         .addReg(MI.getOperand(0).getReg())
+        .addReg(MI.getOperand(1).getReg())
         .addFrameIndex(TPIDR2.FrameIndex)
         .addImm(0);
-    // Set the reserved bytes (10-15) to zero
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
-        .addReg(AArch64::WZR)
-        .addFrameIndex(TPIDR2.FrameIndex)
-        .addImm(5);
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
-        .addReg(AArch64::WZR)
-        .addFrameIndex(TPIDR2.FrameIndex)
-        .addImm(3);
   } else
     MFI.RemoveStackObject(TPIDR2.FrameIndex);
 
@@ -3111,21 +3175,24 @@ MachineBasicBlock *
 AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
                                          MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
-  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
   Register ResultReg = MI.getOperand(0).getReg();
-  if (FuncInfo->isPStateSMRegUsed()) {
+  if (MF->getRegInfo().use_empty(ResultReg)) {
+    // Nothing to do. Pseudo erased below.
+  } else if (Subtarget->hasSME()) {
+    BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
+        .addImm(AArch64SysReg::SVCR)
+        .addReg(AArch64::VG, RegState::Implicit);
+  } else {
     RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
     const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+    BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
         .addExternalSymbol(getLibcallName(LC))
         .addReg(AArch64::X0, RegState::ImplicitDefine)
         .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
         .addReg(AArch64::X0);
-  } else {
-    assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
-           "Expected no users of the entry pstate.sm!");
   }
   MI.eraseFromParent();
   return BB;
@@ -4912,6 +4979,18 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
   if (DstWidth < SatWidth)
     return SDValue();
 
+  if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
+    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+      SDValue CVTf32 =
+          DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
+      SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
+                         DAG.getValueType(SatVT));
+    }
+    SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
+    return DAG.getBitcast(DstVT, CVTf32);
+  }
+
   SDValue NativeCvt =
       DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
   SDValue Sat;
@@ -5242,6 +5321,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 
 static MVT getSVEContainerType(EVT ContentTy);
 
+SDValue
+AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  uint64_t EltSize = Op.getConstantOperandVal(2);
+  EVT VT = Op.getValueType();
+  switch (EltSize) {
+  case 1:
+    if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
+      return SDValue();
+    break;
+  case 2:
+    if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
+      return SDValue();
+    break;
+  case 4:
+    if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
+      return SDValue();
+    break;
+  case 8:
+    if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
+      return SDValue();
+    break;
+  default:
+    // Other element sizes are incompatible with whilewr/whilerw, so expand.
+    return SDValue();
+  }
+
+  SDValue PtrA = Op.getOperand(0);
+  SDValue PtrB = Op.getOperand(1);
+
+  if (VT.isScalableVT())
+    return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
+
+  // Fixed-length result (scalable variants, handled above, are already
+  // legal): emit the same node on the scalable i1 predicate type used by
+  // whilewr/whilerw, sign-extend that mask to the scalable integer container,
+  // and extract the fixed-width subvector starting at element 0.
+  EVT ContainerVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                       VT.getVectorNumElements(), true);
+  EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
+
+  SDValue Mask =
+      DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
+  SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
+                     DAG.getVectorIdxConstant(0, DL));
+}
+
 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
                                             SelectionDAG &DAG) const {
   EVT OpVT = Op.getValueType();
@@ -6000,6 +6129,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     EVT PtrVT = getPointerTy(DAG.getDataLayout());
     return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
   }
+  case Intrinsic::aarch64_sve_whilewr_b:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(1, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilewr_h:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(2, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilewr_s:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(4, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilewr_d:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(8, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilerw_b:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(1, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilerw_h:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(2, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilerw_s:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(4, DL, MVT::i64));
+  case Intrinsic::aarch64_sve_whilerw_d:
+    return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(8, DL, MVT::i64));
   case Intrinsic::aarch64_neon_abs: {
     EVT Ty = Op.getValueType();
     if (Ty == MVT::i64) {
@@ -7359,6 +7520,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   default:
     llvm_unreachable("unimplemented operand");
     return SDValue();
+  case ISD::LOOP_DEPENDENCE_RAW_MASK:
+  case ISD::LOOP_DEPENDENCE_WAR_MASK:
+    return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
   case ISD::BITCAST:
     return LowerBITCAST(Op, DAG);
   case ISD::GlobalAddress:
@@ -7873,6 +8037,39 @@ static bool isPassedInFPR(EVT VT) {
          (VT.isFloatingPoint() && !VT.isScalableVector());
 }
 
+SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
+                                               SelectionDAG &DAG) const {
+  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+  SDValue Glue = Chain.getValue(1);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+
+  // The following conditions are true on entry to an exception handler:
+  // - PSTATE.SM is 0.
+  // - PSTATE.ZA is 0.
+  // - TPIDR2_EL0 is null.
+  // See:
+  // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
+  //
+  // Therefore, a function with a streaming interface or body must re-enable
+  // streaming mode; a streaming-compatible one does so IfCallerIsStreaming.
+  //
+  // These mode changes are usually optimized away in catch blocks as they
+  // occur before the __cxa_begin_catch (which is a non-streaming function),
+  // but are necessary in some cases (such as for cleanups).
+
+  if (SMEFnAttrs.hasStreamingInterfaceOrBody())
+    return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
+                               /*Glue*/ Glue, AArch64SME::Always);
+
+  if (SMEFnAttrs.hasStreamingCompatibleInterface())
+    return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
+                               AArch64SME::IfCallerIsStreaming);
+
+  return Chain;
+}
+
 SDValue AArch64TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -8292,7 +8489,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+  if (getTM().useNewSMEABILowering()) {
+    if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+      SDValue Size;
+      if (Attrs.hasZAState()) {
+        SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                                  DAG.getConstant(1, DL, MVT::i32));
+        Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+      } else if (Attrs.hasAgnosticZAInterface()) {
+        RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+        SDValue Callee = DAG.getExternalSymbol(
+            getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+        auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+        TargetLowering::CallLoweringInfo CLI(DAG);
+        CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+            getLibcallCallingConv(LC), RetTy, Callee, {});
+        std::tie(Size, Chain) = LowerCallTo(CLI);
+      }
+      if (Size) {
+        SDValue Buffer = DAG.getNode(
+            ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+        Chain = Buffer.getValue(1);
+
+        Register BufferPtr =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+        Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+                            DAG.getVTList(MVT::Other), Chain);
+        FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+        MFI.CreateVariableSizedObject(Align(16), nullptr);
+      }
+    }
+  } else {
     // Old SME ABI lowering (deprecated):
     // Create a 16 Byte TPIDR2 object. The dynamic buffer
     // will be expanded and stored in the static object later using a
@@ -8313,9 +8542,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
                              {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
         MFI.CreateVariableSizedObject(Align(16), nullptr);
       }
+      SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                                            DAG.getConstant(1, DL, MVT::i32));
       Chain = DAG.getNode(
           AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
-          {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
+          {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
+           /*Num save slices*/ NumZaSaveSlices});
     } else if (Attrs.hasAgnosticZAInterface()) {
       // Call __arm_sme_state_size().
       SDValue BufferSize =
@@ -8338,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       Register BufferPtr =
           MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
       FuncInfo->setSMESaveBufferAddr(BufferPtr);
-      Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+      Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
     }
   }
 
@@ -8905,7 +9137,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
   SmallVector<SDValue> Ops = {Chain, MSROp};
   unsigned Opcode;
   if (Condition != AArch64SME::Always) {
-    FuncInfo->setPStateSMRegUsed(true);
     Register PStateReg = FuncInfo->getPStateSMReg();
     assert(PStateReg.isValid() && "PStateSM Register is invalid");
     SDValue PStateSM =
@@ -9078,17 +9309,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Determine whether we need any streaming mode changes.
   SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+
+  std::optional<unsigned> ZAMarkerNode;
   bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
-  bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
-  auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
-    // TODO: Handle agnostic ZA functions.
-    if (!UseNewSMEABILowering || IsAgnosticZAFunction)
-      return std::nullopt;
-    if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
-      return std::nullopt;
-    return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
-                                        : AArch64ISD::INOUT_ZA_USE;
-  }();
+  if (UseNewSMEABILowering) {
+    if (CallAttrs.requiresLazySave() ||
+        CallAttrs.requiresPreservingAllZAState())
+      ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
+    else if (CallAttrs.caller().hasZAState() ||
+             CallAttrs.caller().hasZT0State())
+      ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
+  }
 
   if (IsTailCall) {
     // Check if it's really possible to do a tail call.
@@ -9163,21 +9394,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   };
 
   bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
-  bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+  bool RequiresSaveAllZA =
+      !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
   if (RequiresLazySave) {
-    const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
-    MachinePointerInfo MPI =
-        MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
+    TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
         TPIDR2.FrameIndex,
         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-    SDValue NumZaSaveSlicesAddr =
-        DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
-                    DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
-    SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
-                                          DAG.getConstant(1, DL, MVT::i32));
-    Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
-                              MPI, MVT::i16);
     Chain = DAG.getNode(
         ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
@@ -17599,14 +17822,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
-                                                  unsigned Factor) const {
+                                                  unsigned Factor,
+                                                  const APInt &GapMask) const {
 
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
@@ -20868,13 +21093,6 @@ static bool isNegatedInteger(SDValue Op) {
   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
 }
 
-static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  SDValue Zero = DAG.getConstant(0, DL, VT);
-  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
-}
-
 // Try to fold
 //
 // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
@@ -20893,16 +21111,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = CSel.getOperand(0);
   SDValue N1 = CSel.getOperand(1);
 
-  // If both of them is not negations, it's not worth the folding as it
+  // If neither of them is a negation, it's not worth the folding as it
   // introduces two additional negations while reducing one negation.
   if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
     return SDValue();
 
-  SDValue N0N = getNegatedInteger(N0, DAG);
-  SDValue N1N = getNegatedInteger(N1, DAG);
-
   SDLoc DL(N);
   EVT VT = CSel.getValueType();
+
+  SDValue N0N = DAG.getNegative(N0, DL, VT);
+  SDValue N1N = DAG.getNegative(N1, DL, VT);
+
   return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
                      CSel.getOperand(3));
 }
@@ -22087,10 +22306,14 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
   }
 
+  unsigned PTest = AArch64ISD::PTEST;
+  if (Cond == AArch64CC::ANY_ACTIVE)
+    PTest = AArch64ISD::PTEST_ANY;
+  else if (Cond == AArch64CC::FIRST_ACTIVE)
+    PTest = AArch64ISD::PTEST_FIRST;
+
   // Set condition code (CC) flags.
-  SDValue Test = DAG.getNode(
-      Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
-      DL, MVT::i32, Pg, Op);
+  SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
 
   // Convert CC to integer based on requested condition.
   // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
@@ -22158,6 +22381,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                      Zero);
 }
 
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
+                                           SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::i16)
+    return SDValue(); // Only nodes producing i16 are rewritten.
+  // Build Opcode with an f32 result, reinterpret it as i32, truncate to i16.
+  SDLoc DL(N);
+  SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
+  SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -22411,6 +22645,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_uabd:
     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fcvtzs:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtzu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtas:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtau:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtms:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtmu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtns:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtnu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtps:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtpu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
     return tryCombineCRC32(0xff, N, DAG);
@@ -22419,7 +22673,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
     return tryCombineCRC32(0xffff, N, DAG);
   case Intrinsic::aarch64_sve_saddv:
     // There is no i64 version of SADDV because the sign is irrelevant.
-    if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
+    if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
       return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
     else
       return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
@@ -24106,6 +24360,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   // Ensure that all elements' bits are either 0s or 1s.
   ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
 
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
   SmallVector<SDValue, 16> MaskConstants;
   if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
       VecVT == MVT::v16i8) {
@@ -24113,7 +24368,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
     // per entry. We split it into two halves, apply the mask, zip the halves to
     // create 8x 16-bit values, and the perform the vector reduce.
     for (unsigned Half = 0; Half < 2; ++Half) {
-      for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
+      for (unsigned I = 0; I < 8; ++I) {
+        // On big-endian targets, the lane order in sub-byte vector elements
+        // gets reversed, so we need to flip the bit index.
+        unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
         MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
       }
     }
@@ -24131,8 +24389,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   }
 
   // All other vector sizes.
-  unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
-  for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
+  unsigned NumEl = VecVT.getVectorNumElements();
+  for (unsigned I = 0; I < NumEl; ++I) {
+    unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
     MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
   }
 
@@ -24444,6 +24703,105 @@ static SDValue performSTORECombine(SDNode *N,
   return SDValue();
 }
 
+static bool
+isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+  if (N->getOpcode() != ISD::CONCAT_VECTORS)
+    return false;
+
+  unsigned NumParts = N->getNumOperands();
+
+  // We should be concatenating each sequential result from a
+  // VECTOR_INTERLEAVE.
+  SDNode *InterleaveOp = N->getOperand(0).getNode();
+  if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+      InterleaveOp->getNumOperands() != NumParts)
+    return false;
+  // Concat operand I must be result I of that one interleave node.
+  for (unsigned I = 0; I < NumParts; I++)
+    if (N->getOperand(I) != SDValue(InterleaveOp, I))
+      return false;
+  // Match: append the interleave's input operands to Ops (not cleared first).
+  Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
+  return true;
+}
+
+static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
+                                              SDValue WideMask,
+                                              unsigned RequiredNumParts) {
+  if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
+    SmallVector<SDValue, 4> MaskInterleaveOps;
+    if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
+                                              MaskInterleaveOps))
+      return SDValue();
+
+    if (MaskInterleaveOps.size() != RequiredNumParts)
+      return SDValue();
+
+    // Make sure the inputs to the vector interleave are identical.
+    if (!llvm::all_equal(MaskInterleaveOps))
+      return SDValue();
+    // All parts share one mask, so any of them is the narrow mask.
+    return MaskInterleaveOps[0];
+  }
+  // The only other accepted form is a splat; anything else is rejected.
+  if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
+    return SDValue();
+  // Splat the same operand into an i1 vector of EC / RequiredNumParts lanes.
+  ElementCount EC = WideMask.getValueType().getVectorElementCount();
+  assert(EC.isKnownMultipleOf(RequiredNumParts) &&
+         "Expected element count divisible by number of parts");
+  EC = EC.divideCoefficientBy(RequiredNumParts);
+  return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+                     WideMask->getOperand(0));
+}
+
+static SDValue performInterleavedMaskedStoreCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue WideValue = MST->getValue();
+
+  // Bail out if the stored value has an unexpected number of uses, since we'll
+  // have to perform manual interleaving and may as well just use normal masked
+  // stores. Also, discard stores that are truncating, indexed or non-simple.
+  if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
+      !MST->isSimple() || !MST->getOffset().isUndef())
+    return SDValue();
+
+  SmallVector<SDValue, 4> ValueInterleaveOps;
+  if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
+                                            ValueInterleaveOps))
+    return SDValue();
+
+  unsigned NumParts = ValueInterleaveOps.size();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue(); // Only st2 and st4 are emitted below.
+
+  // At the moment we're unlikely to see a fixed-width vector interleave as
+  // we usually generate shuffles instead.
+  EVT SubVecTy = ValueInterleaveOps[0].getValueType();
+  if (!SubVecTy.isScalableVT() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue NarrowMask =
+      getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
+  if (!NarrowMask)
+    return SDValue();
+  // st2/st4 operands: chain, intrinsic ID, values, narrow mask, base pointer.
+  const Intrinsic::ID IID =
+      NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
+  SmallVector<SDValue, 8> NewStOps;
+  NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
+  NewStOps.append(ValueInterleaveOps);
+  NewStOps.append({NarrowMask, MST->getBasePtr()});
+  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
+}
+
 static SDValue performMSTORECombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG,
@@ -24453,6 +24811,9 @@ static SDValue performMSTORECombine(SDNode *N,
   SDValue Mask = MST->getMask();
   SDLoc DL(N);
 
+  if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
+    return Res;
+
   // If this is a UZP1 followed by a masked store, fold this into a masked
   // truncating store.  We can do this even if this is already a masked
   // truncstore.
@@ -26523,6 +26884,26 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
   }
 
+  // Sign extend of CSET -> CSETM.
+  if (Opc == AArch64ISD::CSEL &&
+      cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
+    EVT VT = N->getValueType(0);
+    SDValue TVal = Src.getOperand(0);
+    SDValue FVal = Src.getOperand(1);
+
+    // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
+    if (isNullConstant(TVal) && isOneConstant(FVal))
+      return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
+                         DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
+                         Src.getOperand(3));
+
+    // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
+    if (isOneConstant(TVal) && isNullConstant(FVal))
+      return DAG.getNode(AArch64ISD::CSEL, DL, VT,
+                         DAG.getAllOnesConstant(DL, VT), FVal,
+                         Src.getOperand(2), Src.getOperand(3));
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -27020,6 +27401,83 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
+static SDValue performVectorDeinterleaveCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned NumParts = N->getNumOperands();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue(); // Only ld2 and ld4 are emitted below.
+
+  EVT SubVecTy = N->getValueType(0);
+
+  // At the moment we're unlikely to see a fixed-width vector deinterleave as
+  // we usually generate shuffles instead.
+  unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+  if (!SubVecTy.isScalableVector() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+    return SDValue();
+
+  // Make sure each input operand is the correct extract_subvector of the same
+  // wider vector.
+  SDValue Op0 = N->getOperand(0);
+  for (unsigned I = 0; I < NumParts; I++) {
+    SDValue OpI = N->getOperand(I);
+    if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        OpI->getOperand(0) != Op0->getOperand(0))
+      return SDValue();
+    if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
+      return SDValue();
+  }
+
+  // Normal loads are currently already handled by the InterleavedAccessPass so
+  // we don't expect to see them here. Bail out if the masked load has an
+  // unexpected number of uses, since we want to avoid a situation where we have
+  // both deinterleaving loads and normal loads in the same block. Also, discard
+  // masked loads that are extending, indexed, have an unexpected offset or have
+  // an unsupported passthru value until we find a valid use case.
+  auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+  if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+      !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
+      !MaskedLoad->getOffset().isUndef() ||
+      (!MaskedLoad->getPassThru()->isUndef() &&
+       !isZerosVector(MaskedLoad->getPassThru().getNode())))
+    return SDValue();
+
+  // Now prove that the mask is an interleave of identical masks.
+  SDLoc DL(N);
+  SDValue NarrowMask =
+      getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
+  if (!NarrowMask)
+    return SDValue();
+  // Operands of ld2/ld4: chain, intrinsic ID, narrow mask, base pointer.
+  const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+                                          : Intrinsic::aarch64_sve_ld4_sret;
+  SDValue NewLdOps[] = {MaskedLoad->getChain(),
+                        DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+                        MaskedLoad->getBasePtr()};
+  SDValue Res;
+  if (NumParts == 2)
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+  else
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+                      NewLdOps);
+
+  // Gather the NumParts vector results of the new structured load.
+  SmallVector<SDValue, 4> ResOps(NumParts);
+  for (unsigned Idx = 0; Idx < NumParts; Idx++)
+    ResOps[Idx] = SDValue(Res.getNode(), Idx);
+
+  // Replace uses of the original chain result with the new chain result.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+                                SDValue(Res.getNode(), NumParts));
+  return DCI.CombineTo(N, ResOps, false);
+}
+
 /// If the operand is a bitwise AND with a constant RHS, and the shift has a
 /// constant RHS and is the only use, we can pull it out of the shift, i.e.
 ///
@@ -27088,6 +27546,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   default:
     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
     break;
+  case ISD::VECTOR_DEINTERLEAVE:
+    return performVectorDeinterleaveCombine(N, DCI, DAG);
   case ISD::VECREDUCE_AND:
   case ISD::VECREDUCE_OR:
   case ISD::VECREDUCE_XOR:
@@ -30640,10 +31100,41 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
 }
 
+bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+
+  // TODO: Add more target nodes.
+  switch (Op.getOpcode()) {
+  case AArch64ISD::MOVI:
+  case AArch64ISD::MOVIedit:
+  case AArch64ISD::MOVImsl:
+  case AArch64ISD::MOVIshift:
+  case AArch64ISD::MVNImsl:
+  case AArch64ISD::MVNIshift:
+  case AArch64ISD::VASHR:
+  case AArch64ISD::VLSHR:
+  case AArch64ISD::VSHL:
+    return false; // Reported as never creating undef or poison.
+  }
+  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
+      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
+}
+
 bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
   return Op.getOpcode() == AArch64ISD::DUP ||
          Op.getOpcode() == AArch64ISD::MOVI ||
          Op.getOpcode() == AArch64ISD::MOVIshift ||
+         Op.getOpcode() == AArch64ISD::MOVImsl ||
+         Op.getOpcode() == AArch64ISD::MOVIedit ||
+         Op.getOpcode() == AArch64ISD::MVNIshift ||
+         Op.getOpcode() == AArch64ISD::MVNImsl ||
+         // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
+         // ISel will select fmov(mov i64 0x8000000000000000), resulting in an
+         // fmov from gpr to fpr, which is more expensive than fneg(movi(0)).
+         (Op.getOpcode() == ISD::FNEG &&
+          Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
+          Op.getOperand(0).getConstantOperandVal(0) == 0) ||
          (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
          TargetLowering::isTargetCanonicalConstantNode(Op);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6c6ae782f779..f5d14905cac6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,8 +233,8 @@ public:
                             ArrayRef<unsigned> Indices, unsigned Factor,
                             const APInt &GapMask) const override;
   bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                             ShuffleVectorInst *SVI,
-                             unsigned Factor) const override;
+                             ShuffleVectorInst *SVI, unsigned Factor,
+                             const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
@@ -575,6 +575,9 @@ private:
 
   bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
 
+  SDValue lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
+                          SelectionDAG &DAG) const override;
+
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -735,6 +738,7 @@ private:
   SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
@@ -868,6 +872,12 @@ private:
                                          TargetLoweringOpt &TLO,
                                          unsigned Depth) const override;
 
+  bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
+                                           const APInt &DemandedElts,
+                                           const SelectionDAG &DAG,
+                                           bool PoisonOnly, bool ConsiderFlags,
+                                           unsigned Depth) const override;
+
   bool isTargetCanonicalConstantNode(SDValue Op) const override;
 
   // With the exception of data-predicate transitions, no instructions are
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 178dab689739..8958ad129269 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1327,6 +1327,8 @@ def move_vec_shift : Operand<i32> {
   let PrintMethod = "printShifter";
   let EncoderMethod = "getMoveVecShifterOpValue";
   let ParserMatchClass = MoveVecShifterOperand;
+  let OperandType = "OPERAND_SHIFT_MSL";
+  let OperandNamespace = "AArch64";
 }
 
 let DiagnosticType = "AddSubSecondSource" in {
@@ -3032,8 +3034,12 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
 
 // Aliases for register+register add/subtract.
 class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
-                     RegisterClass src1Regtype, RegisterClass src2Regtype,
-                     int shiftExt>
+                     RegisterClass src1Regtype, dag src2>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst dstRegtype:$dst, src1Regtype:$src1, src2)>;
+class AddSubRegAlias64<string asm, Instruction inst, RegisterClass dstRegtype,
+                       RegisterClass src1Regtype, RegisterClass src2Regtype,
+                       int shiftExt>
     : InstAlias<asm#"\t$dst, $src1, $src2",
                 (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
                       shiftExt)>;
@@ -3101,22 +3107,22 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
 
   // Register/register aliases with no shift when SP is not used.
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
-                       GPR32, GPR32, GPR32, 0>;
+                       GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>;
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
-                       GPR64, GPR64, GPR64, 0>;
+                       GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>;
 
   // Register/register aliases with no shift when either the destination or
   // first source register is SP.
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
-                       GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+                       GPR32sponly, GPR32sp,
+                       (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
-                       GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
-  def : AddSubRegAlias<mnemonic,
-                       !cast<Instruction>(NAME#"Xrx64"),
-                       GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
-  def : AddSubRegAlias<mnemonic,
-                       !cast<Instruction>(NAME#"Xrx64"),
-                       GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+                       GPR32sp, GPR32sponly,
+                       (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
+  def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+                         GPR64sponly, GPR64sp, GPR64, 24>;          // UXTX #0
+  def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+                         GPR64sp, GPR64sponly, GPR64, 24>;          // UXTX #0
 }
 
 multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
@@ -3180,15 +3186,19 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
   def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
                   XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
   def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
-                  WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+                  WZR, GPR32sp:$src1,
+                  (arith_extended_reg32_i32 GPR32:$src2, arith_extend:$sh)), 4>;
   def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
-                  XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+                  XZR, GPR64sp:$src1,
+                  (arith_extended_reg32_i64 GPR32:$src2, arith_extend:$sh)), 4>;
   def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
                   XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
   def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
-                  WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+                  WZR, GPR32:$src1,
+                  (arith_shifted_reg32 GPR32:$src2, arith_shift32:$sh)), 4>;
   def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
-                  XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+                  XZR, GPR64:$src1,
+                  (arith_shifted_reg64 GPR64:$src2, arith_shift64:$sh)), 4>;
 
   // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
   def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
@@ -3198,27 +3208,28 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
 
   // Compare shorthands
   def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
-                  WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+                  WZR, GPR32:$src1, (arith_shifted_reg32 GPR32:$src2, 0)), 5>;
   def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
-                  XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+                  XZR, GPR64:$src1, (arith_shifted_reg64 GPR64:$src2, 0)), 5>;
   def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
-                  WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+                  WZR, GPR32sponly:$src1,
+                  (arith_extended_reg32_i32 GPR32:$src2, 16)), 5>;
   def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
                   XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
 
   // Register/register aliases with no shift when SP is not used.
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
-                       GPR32, GPR32, GPR32, 0>;
+                       GPR32, GPR32, (arith_shifted_reg32 GPR32:$src2, 0)>;
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
-                       GPR64, GPR64, GPR64, 0>;
+                       GPR64, GPR64, (arith_shifted_reg64 GPR64:$src2, 0)>;
 
   // Register/register aliases with no shift when the first source register
   // is SP.
   def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
-                       GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
-  def : AddSubRegAlias<mnemonic,
-                       !cast<Instruction>(NAME#"Xrx64"),
-                       GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+                       GPR32, GPR32sponly,
+                       (arith_extended_reg32_i32 GPR32:$src2, 16)>; // UXTW #0
+  def : AddSubRegAlias64<mnemonic, !cast<Instruction>(NAME#"Xrx64"),
+                         GPR64, GPR64sponly, GPR64, 24>;            // UXTX #0
 }
 
 class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode>
@@ -3403,9 +3414,10 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
 }
 
 // Aliases for register+register logical instructions.
-class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype,
+                      dag op2>
     : InstAlias<asm#"\t$dst, $src1, $src2",
-                (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+                (inst regtype:$dst, regtype:$src1, op2)>;
 
 multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
                       string Alias> {
@@ -3477,10 +3489,10 @@ multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
     let Inst{31} = 1;
   }
 
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+  def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                        GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>;
+  def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                        GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>;
 }
 
 // Split from LogicalReg to allow setting NZCV Defs
@@ -3500,10 +3512,10 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
   }
   } // Defs = [NZCV]
 
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+  def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                        GPR32, (logical_shifted_reg32 GPR32:$src2, 0)>;
+  def : LogicalRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                        GPR64, (logical_shifted_reg64 GPR64:$src2, 0)>;
 }
 
 //---
@@ -3991,9 +4003,10 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
   let Inst{4-0}   = Rt;
 }
 
-class ROInstAlias<string asm, DAGOperand regtype, Instruction INST>
+class ROInstAlias<string asm, DAGOperand regtype, Instruction INST,
+                  ro_extend ext>
   : InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
-              (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+              (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, (ext 0, 0))>;
 
 multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
                    string asm, ValueType Ty, SDPatternOperator loadop> {
@@ -4019,7 +4032,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>;
 }
 
 multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4044,7 +4057,7 @@ multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend8>;
 }
 
 class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4091,7 +4104,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>;
 }
 
 multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4116,7 +4129,7 @@ multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend16>;
 }
 
 class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4163,7 +4176,7 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>;
 }
 
 multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4188,7 +4201,7 @@ multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend32>;
 }
 
 class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4235,7 +4248,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
 }
 
 multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4260,7 +4273,7 @@ multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
 }
 
 class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
@@ -4307,7 +4320,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>;
 }
 
 multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
@@ -4328,7 +4341,7 @@ multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
     let Inst{13} = 0b1;
   }
 
-  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX"), ro_Xextend128>;
 }
 
 let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
@@ -4377,9 +4390,7 @@ multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
     let Inst{13} = 0b1;
   }
 
-  def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
-               (!cast<Instruction>(NAME # "roX") prfop:$Rt,
-                                                 GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+  def : ROInstAlias<"prfm", prfop, !cast<Instruction>(NAME # "roX"), ro_Xextend64>;
 }
 
 //---
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index db028b4b7677..e56fe90259d5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -91,8 +91,8 @@ static cl::opt<unsigned> GatherOptSearchLimit(
              "machine-combiner gather pattern optimization"));
 
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
-    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
-                          AArch64::CATCHRET),
+    : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN,
+                          AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
       RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
 
 /// GetInstSize - Return the number of bytes of code the specified
@@ -1299,6 +1299,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
     break;
   case AArch64::PTEST_PP:
   case AArch64::PTEST_PP_ANY:
+  case AArch64::PTEST_PP_FIRST:
     SrcReg = MI.getOperand(0).getReg();
     SrcReg2 = MI.getOperand(1).getReg();
     if (MI.getOperand(2).getSubReg())
@@ -1691,7 +1692,8 @@ bool AArch64InstrInfo::optimizeCompareInstr(
   }
 
   if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
-      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
+      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
+      CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
     return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
 
   if (SrcReg2 != 0)
@@ -5075,7 +5077,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
             .addImm(0)
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
       }
-    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5202,7 +5204,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
           .addReg(SrcReg, getKillRegState(KillSrc))
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
-    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5318,15 +5320,49 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   if (AArch64::FPR64RegClass.contains(DestReg) &&
       AArch64::FPR64RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+        !Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+                                                     &AArch64::FPR128RegClass);
+      MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::dsub,
+                                                    &AArch64::FPR128RegClass);
+      // This instruction is reading and writing Q registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegQ, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
 
   if (AArch64::FPR32RegClass.contains(DestReg) &&
       AArch64::FPR32RegClass.contains(SrcReg)) {
-    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
-        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+        !Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+                                                     &AArch64::FPR128RegClass);
+      MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                                    &AArch64::FPR128RegClass);
+      // This instruction is reading and writing Q registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegQ, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+               !Subtarget.hasZeroCycleRegMoveFPR32()) {
       const TargetRegisterInfo *TRI = &getRegisterInfo();
       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
                                                      &AArch64::FPR64RegClass);
@@ -5348,8 +5384,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   if (AArch64::FPR16RegClass.contains(DestReg) &&
       AArch64::FPR16RegClass.contains(SrcReg)) {
-    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
-        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+        !Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+                                                     &AArch64::FPR128RegClass);
+      MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                                    &AArch64::FPR128RegClass);
+      // This instruction is reading and writing Q registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegQ, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+               !Subtarget.hasZeroCycleRegMoveFPR32()) {
       const TargetRegisterInfo *TRI = &getRegisterInfo();
       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
                                                      &AArch64::FPR64RegClass);
@@ -5375,8 +5427,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   if (AArch64::FPR8RegClass.contains(DestReg) &&
       AArch64::FPR8RegClass.contains(SrcReg)) {
-    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
-        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
+        !Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      MCRegister DestRegQ = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+                                                     &AArch64::FPR128RegClass);
+      MCRegister SrcRegQ = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                                    &AArch64::FPR128RegClass);
+      // This instruction is reading and writing Q registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegQ, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcRegQ, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+               !Subtarget.hasZeroCycleRegMoveFPR32()) {
       const TargetRegisterInfo *TRI = &getRegisterInfo();
       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
                                                      &AArch64::FPR64RegClass);
@@ -5403,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // Copies between GPR64 and FPR64.
   if (AArch64::FPR64RegClass.contains(DestReg) &&
       AArch64::GPR64RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    if (AArch64::XZR == SrcReg) {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
   if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5416,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // Copies between GPR32 and FPR32.
   if (AArch64::FPR32RegClass.contains(DestReg) &&
       AArch64::GPR32RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    if (AArch64::WZR == SrcReg) {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
   if (AArch64::GPR32RegClass.contains(DestReg) &&
@@ -6652,7 +6728,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
   if (MO.isReg() && MO.getReg().isVirtual())
     MI = MRI.getUniqueVRegDef(MO.getReg());
   // And it needs to be in the trace (otherwise, it won't have a depth).
-  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
+  if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
     return false;
   // Must only used by the user we combine with.
   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 89f88776d832..f0020a9a3c91 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -989,6 +989,17 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
                                  [(int_aarch64_neon_fcvtxn node:$Rn),
                                   (AArch64fcvtxn_n node:$Rn)]>;
 
+def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
+def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
+def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>;
+def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>;
+def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
+def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
+def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
+
 //def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
 
 // Vector immediate ops
@@ -2155,7 +2166,7 @@ let Predicates = [HasPAuth] in {
                             i64imm:$Disc, GPR64:$AddrDisc),
                        [], "$AuthVal = $Val">, Sched<[WriteI, ReadI]> {
     let isCodeGenOnly = 1;
-    let hasSideEffects = 0;
+    let hasSideEffects = 1;
     let mayStore = 0;
     let mayLoad = 0;
     let Size = 32;
@@ -2660,13 +2671,17 @@ defm ADD : AddSub<0, "add", "sub", add>;
 defm SUB : AddSub<1, "sub", "add">;
 
 def : InstAlias<"mov $dst, $src",
-                (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+                (ADDWri GPR32sponly:$dst, GPR32sp:$src,
+                        (addsub_shifted_imm32 0, 0))>;
 def : InstAlias<"mov $dst, $src",
-                (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+                (ADDWri GPR32sp:$dst, GPR32sponly:$src,
+                        (addsub_shifted_imm32 0, 0))>;
 def : InstAlias<"mov $dst, $src",
-                (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+                (ADDXri GPR64sponly:$dst, GPR64sp:$src,
+                        (addsub_shifted_imm64 0, 0))>;
 def : InstAlias<"mov $dst, $src",
-                (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+                (ADDXri GPR64sp:$dst, GPR64sponly:$src,
+                        (addsub_shifted_imm64 0, 0))>;
 
 defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
 defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
@@ -2726,19 +2741,31 @@ def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
           (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
 }
 
-def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
-def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src",
+                (SUBWrs GPR32:$dst, WZR,
+                        (arith_shifted_reg32 GPR32:$src, 0)), 3>;
+def : InstAlias<"neg $dst, $src",
+                (SUBXrs GPR64:$dst, XZR,
+                        (arith_shifted_reg64 GPR64:$src, 0)), 3>;
 def : InstAlias<"neg $dst, $src$shift",
-                (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+                (SUBWrs GPR32:$dst, WZR,
+                        (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>;
 def : InstAlias<"neg $dst, $src$shift",
-                (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
-
-def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
-def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+                (SUBXrs GPR64:$dst, XZR,
+                        (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>;
+
+def : InstAlias<"negs $dst, $src",
+                (SUBSWrs GPR32:$dst, WZR,
+                         (arith_shifted_reg32 GPR32:$src, 0)), 3>;
+def : InstAlias<"negs $dst, $src",
+                (SUBSXrs GPR64:$dst, XZR,
+                         (arith_shifted_reg64 GPR64:$src, 0)), 3>;
 def : InstAlias<"negs $dst, $src$shift",
-                (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+                (SUBSWrs GPR32:$dst, WZR,
+                         (arith_shifted_reg32 GPR32:$src, arith_shift32:$shift)), 2>;
 def : InstAlias<"negs $dst, $src$shift",
-                (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+                (SUBSXrs GPR64:$dst, XZR,
+                         (arith_shifted_reg64 GPR64:$src, arith_shift64:$shift)), 2>;
 
 
 // Unsigned/Signed divide
@@ -3165,16 +3192,26 @@ defm ORN  : LogicalReg<0b01, 1, "orn",
                        BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
 defm ORR  : LogicalReg<0b01, 0, "orr", or>;
 
-def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
-def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
-
-def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
-def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+def : InstAlias<"mov $dst, $src",
+                (ORRWrs GPR32:$dst, WZR,
+                        (logical_shifted_reg32 GPR32:$src, 0)), 2>;
+def : InstAlias<"mov $dst, $src", 
+                (ORRXrs GPR64:$dst, XZR,
+                        (logical_shifted_reg64 GPR64:$src, 0)), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm",
+                (ORNWrs GPR32:$Wd, WZR,
+                        (logical_shifted_reg32 GPR32:$Wm, 0)), 3>;
+def : InstAlias<"mvn $Xd, $Xm",
+                (ORNXrs GPR64:$Xd, XZR,
+                        (logical_shifted_reg64 GPR64:$Xm, 0)), 3>;
 
 def : InstAlias<"mvn $Wd, $Wm$sh",
-                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+                (ORNWrs GPR32:$Wd, WZR,
+                        (logical_shifted_reg32 GPR32:$Wm, logical_shift32:$sh)), 2>;
 def : InstAlias<"mvn $Xd, $Xm$sh",
-                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+                (ORNXrs GPR64:$Xd, XZR,
+                        (logical_shifted_reg64 GPR64:$Xm, logical_shift64:$sh)), 2>;
 
 def : InstAlias<"tst $src1, $src2",
                 (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
@@ -3182,14 +3219,18 @@ def : InstAlias<"tst $src1, $src2",
                 (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
 
 def : InstAlias<"tst $src1, $src2",
-                        (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+                (ANDSWrs WZR, GPR32:$src1,
+                         (logical_shifted_reg32 GPR32:$src2, 0)), 3>;
 def : InstAlias<"tst $src1, $src2",
-                        (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+                (ANDSXrs XZR, GPR64:$src1,
+                         (logical_shifted_reg64 GPR64:$src2, 0)), 3>;
 
 def : InstAlias<"tst $src1, $src2$sh",
-               (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+                (ANDSWrs WZR, GPR32:$src1,
+                         (logical_shifted_reg32 GPR32:$src2, logical_shift32:$sh)), 2>;
 def : InstAlias<"tst $src1, $src2$sh",
-               (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+                (ANDSXrs XZR, GPR64:$src1,
+                         (logical_shifted_reg64 GPR64:$src2, logical_shift64:$sh)), 2>;
 
 
 def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
@@ -4710,6 +4751,26 @@ let Predicates = [IsLE] in {
             (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
 }
 
+// truncstorei32 of f64 bitcasted to i64
+def : Pat<(truncstorei32 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+          (STRSui (EXTRACT_SUBREG FPR64:$Rt, ssub), GPR64sp:$Rn, uimm12s4:$offset)>;
+
+// truncstorei16 of f64 bitcasted to i64
+def : Pat<(truncstorei16 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+          (STRHui (f16 (EXTRACT_SUBREG FPR64:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$offset)>;      
+
+// truncstorei16 of f32 bitcasted to i32
+def : Pat<(truncstorei16 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed16 GPR64sp:$Rn, uimm12s2:$off)),
+          (STRHui (f16 (EXTRACT_SUBREG FPR32:$Rt, hsub)), GPR64sp:$Rn, uimm12s2:$off)>;
+
+// truncstorei8 of f64 bitcasted to i64
+def : Pat<(truncstorei8 (i64 (bitconvert (f64 FPR64:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)),
+          (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR64:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>;
+
+// truncstorei8 of f32 bitcasted to i32
+def : Pat<(truncstorei8 (i32 (bitconvert (f32 FPR32:$Rt))), (am_indexed8 GPR64sp:$Rn, uimm12s1:$off)),
+          (STRBui (aarch64mfp8 (EXTRACT_SUBREG FPR32:$Rt, bsub)), GPR64sp:$Rn, uimm12s1:$off)>;
+
 // truncstore i64
 def : Pat<(truncstorei32 GPR64:$Rt,
                          (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
@@ -6536,9 +6597,33 @@ defm UQXTN  : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar
 defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
                                     int_aarch64_neon_usqadd>;
 
+// f16 -> s16 conversions
+let Predicates = [HasFullFP16] in {
+  def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>;
+  def : Pat<(i16(fp_to_uint_sat_gi f16:$Rn)), (FCVTZUv1f16 f16:$Rn)>;
+}
+
 def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))),
           (CMLTv1i64rz V64:$Rn)>;
 
+// f16 -> i16 conversions leave the bit pattern in an f32
+class F16ToI16ScalarPat<SDNode cvt_isd, BaseSIMDTwoScalar instr>
+    : Pat<(f32 (cvt_isd (f16 FPR16:$Rn))),
+     (f32 (SUBREG_TO_REG (i64 0), (instr FPR16:$Rn), hsub))>;
+
+let Predicates = [HasFullFP16] in {
+def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>;
+}
+
 // Round FP64 to BF16.
 let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in
 def : Pat<(bf16 (any_fpround (f64 FPR64:$Rn))),
@@ -6641,20 +6726,24 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
+let HasOneUse = 1 in { // Fragments match only a single-use fp-to-int node.
+def any_fp_to_sint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_sint $src0)>;
+def any_fp_to_uint_oneuse: PatFrag<(ops node:$src0), (any_fp_to_uint $src0)>;
+}
 let Predicates = [HasNEONandIsSME2p2StreamingSafe] in {
-def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
+def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint_oneuse f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
-def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
+def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f32:$Rn)))),
           (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>;
-def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
+def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint_oneuse f64:$Rn)))),
           (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>;
-def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
+def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f32:$Rn)))),
           (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
 
 let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
+def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint_oneuse f16:$Rn)))),
           (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
-def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
+def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint_oneuse f16:$Rn)))),
           (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
 }
 
@@ -8234,6 +8323,29 @@ def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
                             (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
 }
 
+// SABA patterns for add(x, abs(y)) -> saba(x, y, 0); the zero operand is MOVIv2d_ns #0 (its dsub for the 64-bit forms)
+def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+
+// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0); the zero operand is the dsub of MOVIv2d_ns #0
+def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))),
+          (SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))),
+          (SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))),
+          (SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+
+
 //----------------------------------------------------------------------------
 // AdvSIMD indexed element
 //----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 1fde87e65a34..993cff112ba8 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -228,9 +228,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // on function entry to record the initial pstate of a function.
   Register PStateSMReg = MCRegister::NoRegister;
 
-  // true if PStateSMReg is used.
-  bool PStateSMRegUsed = false;
-
   // Has the PNReg used to build PTRUE instruction.
   // The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
   unsigned PredicateRegForFillSpill = 0;
@@ -238,6 +235,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // Holds the SME function attributes (streaming mode, ZA/ZT0 state).
   SMEAttrs SMEFnAttrs;
 
+  // Holds the TPIDR2 block if allocated early (for Windows/stack probes
+  // support).
+  Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
   // Note: The following properties are only used for the old SME ABI lowering:
   /// The frame-index for the TPIDR2 object used for lazy saves.
   TPIDR2Object TPIDR2;
@@ -256,6 +257,14 @@ public:
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void setEarlyAllocSMESaveBuffer(Register Ptr) {
+    EarlyAllocSMESaveBuffer = Ptr;
+  }
+
+  Register getEarlyAllocSMESaveBuffer() const {
+    return EarlyAllocSMESaveBuffer;
+  }
+
   // Old SME ABI lowering state getters/setters:
   Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
   void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
@@ -273,9 +282,6 @@ public:
   Register getPStateSMReg() const { return PStateSMReg; };
   void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
 
-  unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; };
-  void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
-
   bool isSVECC() const { return IsSVECC; };
   void setIsSVECC(bool s) { IsSVECC = s; };
 
diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
index ff7a0d1faedf..f4a7f774d477 100644
--- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -237,8 +237,8 @@ static bool isAddressLdStPair(const MachineInstr *FirstMI,
 }
 
 /// Compare and conditional select.
-static bool isCCSelectPair(const MachineInstr *FirstMI,
-                           const MachineInstr &SecondMI) {
+static bool isCmpCSelPair(const MachineInstr *FirstMI,
+                          const MachineInstr &SecondMI) {
   // 32 bits
   if (SecondMI.getOpcode() == AArch64::CSELWr) {
     // Assume the 1st instr to be a wildcard if it is unspecified.
@@ -279,6 +279,40 @@ static bool isCCSelectPair(const MachineInstr *FirstMI,
   return false;
 }
 
+/// Compare (SUBS that writes WZR/XZR) followed by cset (CSINC Rd, ZR, ZR).
+static bool isCmpCSetPair(const MachineInstr *FirstMI,
+                          const MachineInstr &SecondMI) {
+  if ((SecondMI.getOpcode() == AArch64::CSINCWr &&
+       SecondMI.getOperand(1).getReg() == AArch64::WZR &&
+       SecondMI.getOperand(2).getReg() == AArch64::WZR) ||
+      (SecondMI.getOpcode() == AArch64::CSINCXr &&
+       SecondMI.getOperand(1).getReg() == AArch64::XZR &&
+       SecondMI.getOperand(2).getReg() == AArch64::XZR)) {
+    // A null FirstMI is a wildcard: report the cset as fusible on its own.
+    if (FirstMI == nullptr)
+      return true;
+
+    if (FirstMI->definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
+        FirstMI->definesRegister(AArch64::XZR, /*TRI=*/nullptr))
+      switch (FirstMI->getOpcode()) {
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+        return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
+      case AArch64::SUBSWrx:
+      case AArch64::SUBSXrx:
+      case AArch64::SUBSXrx64:
+        return !AArch64InstrInfo::hasExtendedReg(*FirstMI);
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+        return true;
+      }
+  }
+
+  return false; // Not a CSINC-of-ZR, or FirstMI is not a matching SUBS.
+}
+
 // Arithmetic and logic.
 static bool isArithmeticLogicPair(const MachineInstr *FirstMI,
                                   const MachineInstr &SecondMI) {
@@ -465,7 +499,9 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
     return true;
   if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
+  if (ST.hasFuseCmpCSel() && isCmpCSelPair(FirstMI, SecondMI))
+    return true;
+  if (ST.hasFuseCmpCSet() && isCmpCSetPair(FirstMI, SecondMI))
     return true;
   if (ST.hasFuseArithmeticLogic() && isArithmeticLogicPair(FirstMI, SecondMI))
     return true;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 42eaeca906e6..81f5d075729d 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -134,6 +134,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
                                FeatureCmpBccFusion,
                                FeatureFuseAES,
                                FeatureFuseAdrpAdd,
+                               FeatureFuseCmpCSel,
+                               FeatureFuseCmpCSet,
                                FeatureAddrLSLSlow14,
                                FeatureALULSLFast,
                                FeaturePostRAScheduler,
@@ -146,6 +148,8 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
                                  FeatureCmpBccFusion,
                                  FeatureFuseAES,
                                  FeatureFuseAdrpAdd,
+                                 FeatureFuseCmpCSel,
+                                 FeatureFuseCmpCSet,
                                  FeatureAddrLSLSlow14,
                                  FeatureALULSLFast,
                                  FeaturePostRAScheduler,
@@ -158,6 +162,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
                                 FeatureCmpBccFusion,
                                 FeatureFuseAES,
                                 FeatureFuseAdrpAdd,
+                                FeatureFuseCmpCSel,
+                                FeatureFuseCmpCSet,
                                 FeatureAddrLSLSlow14,
                                 FeatureALULSLFast,
                                 FeaturePostRAScheduler,
@@ -169,6 +175,8 @@ def TuneA710    : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
                                    FeatureCmpBccFusion,
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
+                                   FeatureFuseCmpCSel,
+                                   FeatureFuseCmpCSet,
                                    FeatureALULSLFast,
                                    FeaturePostRAScheduler,
                                    FeatureEnableSelectOptimize,
@@ -181,6 +189,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
                                  FeatureCmpBccFusion,
                                  FeatureALULSLFast,
                                  FeatureFuseAdrpAdd,
+                                 FeatureFuseCmpCSel,
+                                 FeatureFuseCmpCSet,
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
 
@@ -191,6 +201,8 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
                                  FeatureCmpBccFusion,
                                  FeatureALULSLFast,
                                  FeatureFuseAdrpAdd,
+                                 FeatureFuseCmpCSel,
+                                 FeatureFuseCmpCSet,
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
 
@@ -201,6 +213,8 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
                                  FeatureCmpBccFusion,
                                  FeatureALULSLFast,
                                  FeatureFuseAdrpAdd,
+                                 FeatureFuseCmpCSel,
+                                 FeatureFuseCmpCSet,
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
 
@@ -212,6 +226,8 @@ def TuneA725 : SubtargetFeature<"cortex-a725", "ARMProcFamily",
                                 FeatureCmpBccFusion,
                                 FeatureALULSLFast,
                                 FeatureFuseAdrpAdd,
+                                FeatureFuseCmpCSel,
+                                FeatureFuseCmpCSet,
                                 FeatureEnableSelectOptimize,
                                 FeaturePredictableSelectIsExpensive]>;
 
@@ -262,6 +278,8 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
                               "Cortex-X4 ARM processors", [
                                FeatureALULSLFast,
                                FeatureFuseAdrpAdd,
+                               FeatureFuseCmpCSel,
+                               FeatureFuseCmpCSet,
                                FeatureFuseAES,
                                FeaturePostRAScheduler,
                                FeatureEnableSelectOptimize,
@@ -273,6 +291,8 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
                                 "CortexX925", "Cortex-X925 ARM processors",[
                                 FeatureALULSLFast,
                                 FeatureFuseAdrpAdd,
+                                FeatureFuseCmpCSel,
+                                FeatureFuseCmpCSet,
                                 FeatureFuseAES,
                                 FeaturePostRAScheduler,
                                 FeatureEnableSelectOptimize,
@@ -321,7 +341,11 @@ def TuneAppleA7  : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
                                     FeatureFuseAES, FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing,
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128,
                                     FeatureZCZeroingFPWorkaround]>;
 
 def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -334,7 +358,11 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     "Apple A11", [
@@ -346,7 +374,11 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     "Apple A12", [
@@ -358,7 +390,11 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     "Apple A13", [
@@ -370,7 +406,11 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     "Apple A14", [
@@ -382,12 +422,16 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     FeatureFuseAddress,
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
-                                    FeatureFuseCCSelect,
+                                    FeatureFuseCmpCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     "Apple A15", [
@@ -399,12 +443,16 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     FeatureFuseAdrpAdd,
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
-                                    FeatureFuseCCSelect,
+                                    FeatureFuseCmpCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     "Apple A16", [
@@ -416,12 +464,16 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureFuseAdrpAdd,
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
-                                    FeatureFuseCCSelect,
+                                    FeatureFuseCmpCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     "Apple A17", [
@@ -433,12 +485,16 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureFuseAdrpAdd,
                                     FeatureFuseAES,
                                     FeatureFuseArithmeticLogic,
-                                    FeatureFuseCCSelect,
+                                    FeatureFuseCmpCSel,
                                     FeatureFuseCryptoEOR,
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
-                                    FeatureZCZeroing]>;
+                                    FeatureZCRegMoveFPR128,
+                                    FeatureZCZeroingGPR32,
+                                    FeatureZCZeroingGPR64,
+                                    FeatureNoZCZeroingFPR64,
+                                    FeatureZCZeroingFPR128]>;
 
 def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      "Apple M4", [
@@ -450,12 +506,15 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseAES,
                                      FeatureFuseArithmeticLogic,
-                                     FeatureFuseCCSelect,
+                                     FeatureFuseCmpCSel,
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
                                      FeatureZCRegMoveGPR64,
-                                     FeatureZCZeroing
-                                     ]>;
+                                     FeatureZCRegMoveFPR128,
+                                     FeatureZCZeroingGPR32,
+                                     FeatureZCZeroingGPR64,
+                                     FeatureNoZCZeroingFPR64,
+                                     FeatureZCZeroingFPR128]>;
 
 def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
@@ -463,7 +522,7 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                      FeatureForce32BitJumpTables,
                                      FeatureFuseAddress,
                                      FeatureFuseAES,
-                                     FeatureFuseCCSelect,
+                                     FeatureFuseCmpCSel,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
                                      FeatureStorePairSuppress,
@@ -481,19 +540,21 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                                      FeatureFuseAddress,
                                      FeatureFuseAES,
                                      FeatureFuseArithmeticLogic,
-                                     FeatureFuseCCSelect,
+                                     FeatureFuseCmpCSel,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
                                      FeatureStorePairSuppress,
                                      FeatureALULSLFast,
                                      FeaturePostRAScheduler,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingGPR32,
+                                     FeatureZCZeroingGPR64]>;
 
 def TuneKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    "Qualcomm Kryo processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureALULSLFast,
                                    FeatureStorePairSuppress]>;
 
@@ -501,7 +562,8 @@ def TuneFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    "Qualcomm Falkor processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureStorePairSuppress,
                                    FeatureALULSLFast,
                                    FeatureSlowSTRQro]>;
@@ -526,6 +588,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
                                       "Neoverse N2 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
@@ -537,6 +601,8 @@ def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3
                                       FeaturePostRAScheduler,
                                       FeatureALULSLFast,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
 
@@ -553,6 +619,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
                                       "Neoverse V1 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeatureAddrLSLSlow14,
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
@@ -565,6 +633,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureFuseAES,
                                       FeatureCmpBccFusion,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
@@ -578,6 +648,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
                                       FeatureFuseAES,
                                       FeatureALULSLFast,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeatureAvoidLDAPUR,
@@ -588,6 +660,8 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover
                                       FeatureFuseAES,
                                       FeatureALULSLFast,
                                       FeatureFuseAdrpAdd,
+                                      FeatureFuseCmpCSel,
+                                      FeatureFuseCmpCSet,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeatureAvoidLDAPUR,
@@ -597,7 +671,8 @@ def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing,
+                                   FeatureZCZeroingGPR32,
+                                   FeatureZCZeroingGPR64,
                                    FeatureStorePairSuppress,
                                    FeatureALULSLFast]>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
new file mode 100644
index 000000000000..af424987b8dd
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -0,0 +1,794 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64PrologueEpilogue.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/CFIInstBuilder.h"
+#include "llvm/MC/MCContext.h"
+
+#define DEBUG_TYPE "frame-info"
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+namespace llvm {
+
+AArch64PrologueEmitter::AArch64PrologueEmitter(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               const AArch64FrameLowering &AFL)
+    : MF(MF), MBB(MBB), F(MF.getFunction()), MFI(MF.getFrameInfo()),
+      Subtarget(MF.getSubtarget<AArch64Subtarget>()), AFL(AFL),
+      RegInfo(*Subtarget.getRegisterInfo()) {
+  TII = Subtarget.getInstrInfo();
+  AFI = MF.getInfo<AArch64FunctionInfo>();
+
+  EmitCFI = AFI->needsDwarfUnwindInfo(MF);
+  EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  HasFP = AFL.hasFP(MF);
+  NeedsWinCFI = AFL.needsWinCFI(MF);
+  IsFunclet = MBB.isEHFuncletEntry();
+  HomPrologEpilog = AFL.homogeneousPrologEpilog(MF);
+
+#ifndef NDEBUG
+  collectBlockLiveins(); // Snapshot live regs for verifyPrologueClobbers().
+#endif // NDEBUG
+}
+
+#ifndef NDEBUG
+/// Walk backward from the end of \p MI's parent block up to and including
+/// \p MI, accumulating the live physical registers in \p LiveRegs.
+static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
+                                LivePhysRegs &LiveRegs) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  LiveRegs.addLiveOuts(MBB);
+  for (const MachineInstr &MI : // Shadows parameter MI in the body only.
+       reverse(make_range(MI.getIterator(), MBB.instr_end())))
+    LiveRegs.stepBackward(MI);
+}
+
+void AArch64PrologueEmitter::collectBlockLiveins() {
+  // Collect live registers from the end of MBB back to the first instruction
+  // after the leading run of existing frame-setup instructions.
+  PrologueEndI = MBB.begin();
+  while (PrologueEndI != MBB.end() &&
+         PrologueEndI->getFlag(MachineInstr::FrameSetup))
+    ++PrologueEndI;
+
+  if (PrologueEndI != MBB.end()) {
+    getLivePhysRegsUpTo(*PrologueEndI, RegInfo, LiveRegs);
+    // Ignore registers used for stack management for now.
+    LiveRegs.removeReg(AArch64::SP);
+    LiveRegs.removeReg(AArch64::X19);
+    LiveRegs.removeReg(AArch64::FP);
+    LiveRegs.removeReg(AArch64::LR);
+
+    // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+    // This is necessary to spill VG if required where SVE is unavailable, but
+    // X0 is preserved around this call.
+    if (AFL.requiresGetVGCall(MF))
+      LiveRegs.removeReg(AArch64::X0);
+  }
+}
+
+void AArch64PrologueEmitter::verifyPrologueClobbers() const {
+  if (PrologueEndI == MBB.end())
+    return;
+  // Check that no newly inserted instruction clobbers a live register.
+  for (MachineInstr &MI :
+       make_range(MBB.instr_begin(), PrologueEndI->getIterator())) {
+    for (auto &Op : MI.operands())
+      if (Op.isReg() && Op.isDef())
+        assert(!LiveRegs.contains(Op.getReg()) &&
+               "live register clobbered by inserted prologue instructions");
+  }
+}
+#endif
+
+void AArch64PrologueEmitter::determineLocalsStackSize(
+    uint64_t StackSize, uint64_t PrologueSaveSize) {
+  AFI->setLocalStackSize(StackSize - PrologueSaveSize); // Remainder is locals.
+  CombineSPBump = AFL.shouldCombineCSRLocalStackBump(MF, StackSize);
+}
+
+void AArch64PrologueEmitter::emitPrologue() {
+  const MachineBasicBlock::iterator PrologueBeginI = MBB.begin();
+  const MachineBasicBlock::iterator EndI = MBB.end();
+
+  // At this point, we're going to decide whether or not the function uses a
+  // redzone. In most cases, the function doesn't have a redzone so let's
+  // assume that's false and set it to true in the case that there's a redzone.
+  AFI->setHasRedZone(false);
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+
+  if (AFI->shouldSignReturnAddress(MF)) {
+    // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions
+    // are inserted by emitPacRetPlusLeafHardening().
+    if (!AFL.shouldSignReturnAddressEverywhere(MF)) {
+      BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+    // AArch64PointerAuth pass will insert SEH_PACSignLR
+    HasWinCFI |= NeedsWinCFI;
+  }
+
+  if (AFI->needsShadowCallStackPrologueEpilogue(MF)) {
+    emitShadowCallStackPrologue(PrologueBeginI, DL);
+    HasWinCFI |= NeedsWinCFI;
+  }
+
+  if (EmitCFI && AFI->isMTETagged())
+    BuildMI(MBB, PrologueBeginI, DL, TII->get(AArch64::EMITMTETAGGED))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  // We signal the presence of a Swift extended frame to external tools by
+  // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
+  // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
+  // bits so that is still true.
+  if (HasFP && AFI->hasSwiftAsyncContext())
+    emitSwiftAsyncContextFramePointer(PrologueBeginI, DL);
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
+    return;
+
+  // Set tagged base pointer to the requested stack slot. Ideally it should
+  // match SP value after prologue.
+  if (std::optional<int> TBPI = AFI->getTaggedBasePointerIndex())
+    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+  else
+    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+
+  // getStackSize() includes all the locals in its size calculation. We don't
+  // include these locals when computing the stack size of a funclet, as they
+  // are allocated in the parent's stack frame and accessed via the frame
+  // pointer from the funclet.  We only save the callee saved registers in the
+  // funclet, which are really the callee saved registers of the parent
+  // function, including the funclet.
+  int64_t NumBytes =
+      IsFunclet ? AFL.getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
+  if (!AFI->hasStackFrame() && !AFL.windowsRequiresStackProbe(MF, NumBytes))
+    return emitEmptyStackFramePrologue(NumBytes, PrologueBeginI, DL);
+
+  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
+  unsigned FixedObject = AFL.getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
+
+  // Windows unwind can't represent the required stack adjustments if we have
+  // both SVE callee-saves and dynamic stack allocations, and the frame
+  // pointer is before the SVE spills.  The allocation of the frame pointer
+  // must be the last instruction in the prologue so the unwinder can restore
+  // the stack pointer correctly. (And there isn't any unwind opcode for
+  // `addvl sp, x29, -17`.)
+  //
+  // Because of this, we do spills in the opposite order on Windows: first SVE,
+  // then GPRs. The main side-effect of this is that it makes accessing
+  // parameters passed on the stack more expensive.
+  //
+  // We could consider rearranging the spills for simpler cases.
+  bool FPAfterSVECalleeSaves =
+      Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();
+
+  if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
+    reportFatalUsageError("SME hazard padding is not supported on Windows");
+
+  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
+  // All of the remaining stack allocations are for locals.
+  determineLocalsStackSize(NumBytes, PrologueSaveSize);
+
+  MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI;
+  if (FPAfterSVECalleeSaves) {
+    // If we're doing SVE saves first, we need to immediately allocate space
+    // for fixed objects, then space for the SVE callee saves.
+    //
+    // Windows unwind requires that the scalable size is a multiple of 16;
+    // that's handled when the callee-saved size is computed.
+    auto SaveSize =
+        StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
+        StackOffset::getFixed(FixedObject);
+    AFL.allocateStackSpace(MBB, PrologueBeginI, 0, SaveSize, NeedsWinCFI,
+                           &HasWinCFI,
+                           /*EmitCFI=*/false, StackOffset{},
+                           /*FollowupAllocs=*/true);
+    NumBytes -= FixedObject;
+
+    // Now allocate space for the GPR callee saves.
+    MachineBasicBlock::iterator MBBI = PrologueBeginI;
+    while (MBBI != EndI && AFL.isSVECalleeSave(MBBI))
+      ++MBBI;
+    FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec(
+        MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
+        &HasWinCFI, EmitAsyncCFI);
+    NumBytes -= AFI->getCalleeSavedStackSize();
+  } else if (CombineSPBump) {
+    assert(!AFL.getSVEStackSize(MF) && "Cannot combine SP bump with SVE");
+    emitFrameOffset(MBB, PrologueBeginI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-NumBytes), TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
+                    EmitAsyncCFI);
+    NumBytes = 0;
+  } else if (HomPrologEpilog) {
+    // Stack has been already adjusted.
+    NumBytes -= PrologueSaveSize;
+  } else if (PrologueSaveSize != 0) {
+    FirstGPRSaveI = AFL.convertCalleeSaveRestoreToSPPrePostIncDec(
+        MBB, PrologueBeginI, DL, TII, -PrologueSaveSize, NeedsWinCFI,
+        &HasWinCFI, EmitAsyncCFI);
+    NumBytes -= PrologueSaveSize;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  // Move past the saves of the callee-saved registers, fixing up the offsets
+  // and pre-inc if we decided to combine the callee-save and local stack
+  // pointer bump above.
+  auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+  MachineBasicBlock::iterator AfterGPRSavesI = FirstGPRSaveI;
+  while (AfterGPRSavesI != EndI &&
+         AfterGPRSavesI->getFlag(MachineInstr::FrameSetup) &&
+         !AFL.isSVECalleeSave(AfterGPRSavesI)) {
+    if (CombineSPBump &&
+        // Only fix-up frame-setup load/store instructions.
+        (!AFL.requiresSaveVG(MF) || !AFL.isVGInstruction(AfterGPRSavesI, TLI)))
+      AFL.fixupCalleeSaveRestoreStackOffset(
+          *AfterGPRSavesI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI);
+    ++AfterGPRSavesI;
+  }
+
+  // For funclets the FP belongs to the containing function. Only set up FP if
+  // we actually need to.
+  if (!IsFunclet && HasFP)
+    emitFramePointerSetup(AfterGPRSavesI, DL, FixedObject);
+
+  // Now emit the moves for whatever callee saved regs we have (including FP,
+  // LR if those are saved). Frame instructions for SVE registers are emitted
+  // later, after the instructions which actually save the SVE regs.
+  if (EmitAsyncCFI)
+    emitCalleeSavedGPRLocations(AfterGPRSavesI);
+
+  // Alignment is required for the parent frame, not the funclet.
+  const bool NeedsRealignment =
+      NumBytes && !IsFunclet && RegInfo.hasStackRealignment(MF);
+  const int64_t RealignmentPadding =
+      (NeedsRealignment && MFI.getMaxAlign() > Align(16))
+          ? MFI.getMaxAlign().value() - 16
+          : 0;
+
+  if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding))
+    emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding);
+
+  StackOffset SVEStackSize = AFL.getSVEStackSize(MF);
+  StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
+  MachineBasicBlock::iterator CalleeSavesEnd = AfterGPRSavesI;
+
+  StackOffset CFAOffset =
+      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+
+  // Process the SVE callee-saves to determine what space needs to be
+  // allocated.
+  MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI;
+  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+                      << "\n");
+    SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
+    SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
+    // Find the SVE callee-save instructions in the frame setup code.
+    // Note: With FPAfterSVECalleeSaves the callee saves have already been
+    // allocated.
+    if (!FPAfterSVECalleeSaves) {
+      MachineBasicBlock::iterator CalleeSavesBegin = AfterGPRSavesI;
+      assert(AFL.isSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+      while (AFL.isSVECalleeSave(AfterSVESavesI) &&
+             AfterSVESavesI != MBB.getFirstTerminator())
+        ++AfterSVESavesI;
+      CalleeSavesEnd = AfterSVESavesI;
+
+      StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
+      // Allocate space for the callee saves (if any).
+      AFL.allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize,
+                             false, nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
+                             MFI.hasVarSizedObjects() || LocalsSize);
+    }
+  }
+  CFAOffset += SVECalleeSavesSize;
+
+  if (EmitAsyncCFI)
+    emitCalleeSavedSVELocations(CalleeSavesEnd);
+
+  // Allocate space for the rest of the frame including SVE locals. Align the
+  // stack as necessary.
+  assert(!(AFL.canUseRedZone(MF) && NeedsRealignment) &&
+         "Cannot use redzone with stack realignment");
+  if (!AFL.canUseRedZone(MF)) {
+    // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+    // the correct value here, as NumBytes also includes padding bytes,
+    // which shouldn't be counted here.
+    AFL.allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
+                           SVELocalsSize + StackOffset::getFixed(NumBytes),
+                           NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
+                           CFAOffset, MFI.hasVarSizedObjects());
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value of the
+  // stack pointer is at this point. Any variable size objects will be allocated
+  // after this, so we can still use the base pointer to reference locals.
+  //
+  // FIXME: Clarify FrameSetup flags here.
+  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+  // needed.
+  // For funclets the BP belongs to the containing function.
+  if (!IsFunclet && RegInfo.hasBasePointer(MF)) {
+    TII->copyPhysReg(MBB, AfterSVESavesI, DL, RegInfo.getBaseRegister(),
+                     AArch64::SP, false);
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  // The very last FrameSetup instruction indicates the end of prologue. Emit a
+  // SEH opcode indicating the prologue end.
+  if (NeedsWinCFI && HasWinCFI) {
+    BuildMI(MBB, AfterSVESavesI, DL, TII->get(AArch64::SEH_PrologEnd))
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // SEH funclets are passed the frame pointer in X1.  If the parent
+  // function uses the base register, then the base register is used
+  // directly, and is not retrieved from X1.
+  if (IsFunclet && F.hasPersonalityFn()) {
+    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+    if (isAsynchronousEHPersonality(Per)) {
+      BuildMI(MBB, AfterSVESavesI, DL, TII->get(TargetOpcode::COPY),
+              AArch64::FP)
+          .addReg(AArch64::X1)
+          .setMIFlag(MachineInstr::FrameSetup);
+      MBB.addLiveIn(AArch64::X1);
+    }
+  }
+
+  if (EmitCFI && !EmitAsyncCFI) {
+    if (HasFP) {
+      emitDefineCFAWithFP(AfterSVESavesI, FixedObject);
+    } else {
+      StackOffset TotalSize =
+          SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
+      CFIInstBuilder CFIBuilder(MBB, AfterSVESavesI, MachineInstr::FrameSetup);
+      CFIBuilder.insertCFIInst(
+          createDefCFA(RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
+                       TotalSize, /*LastAdjustmentWasScalable=*/false));
+    }
+    emitCalleeSavedGPRLocations(AfterSVESavesI);
+    emitCalleeSavedSVELocations(AfterSVESavesI);
+  }
+}
+
+void AArch64PrologueEmitter::emitShadowCallStackPrologue(
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const {
+  // Shadow call stack prologue: str x30, [x18], #8
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXpost))
+      .addReg(AArch64::X18, RegState::Define)
+      .addReg(AArch64::LR)
+      .addReg(AArch64::X18)
+      .addImm(8)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // This instruction also makes x18 live-in to the entry block.
+  MBB.addLiveIn(AArch64::X18);
+
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  if (EmitCFI) {
+    // Emit a CFI instruction that causes 8 to be subtracted from the value of
+    // x18 when unwinding past this frame.
+    static const char CFIInst[] = {
+        dwarf::DW_CFA_val_expression,
+        18, // register
+        2,  // length of the expression bytes below
+        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+        static_cast<char>(-8) & 0x7f, // addend (sleb128)
+    };
+    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+        .buildEscape(StringRef(CFIInst, sizeof(CFIInst)));
+  }
+}
+
+void AArch64PrologueEmitter::emitSwiftAsyncContextFramePointer(
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const {
+  switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+  case SwiftAsyncFramePointerMode::DeploymentBased:
+    if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
+      // The special symbol below is absolute and has a *value* that can be
+      // combined with the frame pointer to signal an extended frame.
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
+          .addExternalSymbol("swift_async_extendedFramePointerFlags",
+                             AArch64II::MO_GOT);
+      if (NeedsWinCFI) {
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlags(MachineInstr::FrameSetup);
+        HasWinCFI = true;
+      }
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
+          .addUse(AArch64::FP)
+          .addUse(AArch64::X16)
+          .addImm(Subtarget.isTargetILP32() ? 32 : 0);
+      if (NeedsWinCFI) {
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlags(MachineInstr::FrameSetup);
+        HasWinCFI = true;
+      }
+      break;
+    }
+    [[fallthrough]];
+
+  case SwiftAsyncFramePointerMode::Always:
+    // ORR x29, x29, #0x1000_0000_0000_0000
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
+        .addUse(AArch64::FP)
+        .addImm(0x1100) // Logical-immediate encoding of the constant above.
+        .setMIFlag(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlags(MachineInstr::FrameSetup);
+      HasWinCFI = true;
+    }
+    break;
+
+  case SwiftAsyncFramePointerMode::Never:
+    break;
+  }
+}
+
+void AArch64PrologueEmitter::emitEmptyStackFramePrologue(
+    int64_t NumBytes, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL) const {
+  assert(!HasFP && "unexpected function without stack frame but with FP");
+  assert(!AFL.getSVEStackSize(MF) &&
+         "unexpected function without stack frame but with SVE objects");
+  // All of the stack allocation is for locals.
+  AFI->setLocalStackSize(NumBytes);
+  if (!NumBytes) { // Nothing to allocate.
+    if (NeedsWinCFI && HasWinCFI) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+  // REDZONE: If the function can use the red zone (see canUseRedZone()), we
+  // don't need to actually allocate.
+  if (AFL.canUseRedZone(MF)) {
+    AFI->setHasRedZone(true);
+    ++NumRedZoneFunctions;
+  } else {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-NumBytes), TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+    if (EmitCFI) {
+      // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+      MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
+      // Encode the stack size of the leaf function.
+      CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+          .buildDefCFAOffset(NumBytes, FrameLabel);
+    }
+  }
+
+  if (NeedsWinCFI) {
+    HasWinCFI = true;
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+void AArch64PrologueEmitter::emitFramePointerSetup(
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+    unsigned FixedObject) {
+  int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
+  if (CombineSPBump)
+    FPOffset += AFI->getLocalStackSize();
+
+  if (AFI->hasSwiftAsyncContext()) {
+    // Before we update the live FP we have to ensure there's a valid (or
+    // null) asynchronous context in its slot just before FP in the frame
+    // record, so store it now.
+    const auto &Attrs = MF.getFunction().getAttributes();
+    bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
+    if (HaveInitialContext)
+      MBB.addLiveIn(AArch64::X22);
+    Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
+        .addUse(Reg)
+        .addUse(AArch64::SP)
+        .addImm(FPOffset - 8)
+        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
+      // to multiple instructions, should be mutually-exclusive.
+      assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlags(MachineInstr::FrameSetup);
+      HasWinCFI = true;
+    }
+  }
+
+  if (HomPrologEpilog) {
+    auto Prolog = MBBI;
+    --Prolog;
+    assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
+    Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
+  } else {
+    // Issue    fp = sp + FPOffset, or
+    //          mov fp, sp          when FPOffset is zero.
+    // Note: All stores of callee-saved registers are marked as "FrameSetup".
+    // This code marks the instruction(s) that set the FP also.
+    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+                    StackOffset::getFixed(FPOffset), TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+    if (NeedsWinCFI && HasWinCFI) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+          .setMIFlag(MachineInstr::FrameSetup);
+      // After setting up the FP, the rest of the prolog doesn't need to be
+      // included in the SEH unwind info.
+      NeedsWinCFI = false;
+    }
+  }
+  if (EmitAsyncCFI)
+    emitDefineCFAWithFP(MBBI, FixedObject);
+}
+
+// Define the CFA as the frame register (queried from TRI) plus a fixed offset.
+void AArch64PrologueEmitter::emitDefineCFAWithFP(
+    MachineBasicBlock::iterator MBBI, unsigned FixedObject) const {
+  const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const int OffsetToFirstCalleeSaveFromFP =
+      AFI->getCalleeSaveBaseToFrameRecordOffset() -
+      AFI->getCalleeSavedStackSize();
+  Register FramePtr = TRI->getFrameRegister(MF);
+  CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
+      .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP);
+}
+
+void AArch64PrologueEmitter::emitWindowsStackProbe(
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t &NumBytes,
+    int64_t RealignmentPadding) const {
+  if (AFI->getSVECalleeSavedStackSize())
+    report_fatal_error("SVE callee saves not yet supported with stack probing");
+
+  // Find an available register to spill the value of X15 to, if X15 is
+  // already live-in to this block (e.g. used for nest).
+  unsigned X15Scratch = AArch64::NoRegister;
+  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+  if (llvm::any_of(MBB.liveins(),
+                   [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+                     return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+                         AArch64::X15, LiveIn.PhysReg);
+                   })) {
+    X15Scratch = AFL.findScratchNonCalleeSaveRegister(&MBB, /*HasCall=*/true);
+    assert(X15Scratch != AArch64::NoRegister &&
+           (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
+#ifndef NDEBUG
+    LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
+#endif
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
+        .addReg(AArch64::XZR)
+        .addReg(AArch64::X15, RegState::Undef)
+        .addReg(AArch64::X15, RegState::Implicit)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4; // 16-byte units.
+  if (NeedsWinCFI) {
+    HasWinCFI = true;
+    // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
+    // exceed this amount.  We need to move at most 2^24 - 1 into x15.
+    // This is at most two instructions, MOVZ followed by MOVK.
+    // TODO: Fix to use multiple stack alloc unwind codes for stacks
+    // exceeding 256MB in size.
+    if (NumBytes >= (1 << 28))
+      report_fatal_error("Stack size cannot exceed 256MB for stack "
+                         "unwinding purposes");
+
+    uint32_t LowNumWords = NumWords & 0xFFFF;
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
+        .addImm(LowNumWords)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+        .setMIFlag(MachineInstr::FrameSetup);
+    if ((NumWords & 0xFFFF0000) != 0) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
+          .addReg(AArch64::X15)
+          .addImm((NumWords & 0xFFFF0000) >> 16) // High half
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
+          .setMIFlag(MachineInstr::FrameSetup);
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+        .addImm(NumWords)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  const char *ChkStk = Subtarget.getChkStkName();
+  switch (MF.getTarget().getCodeModel()) {
+  case CodeModel::Tiny:
+  case CodeModel::Small:
+  case CodeModel::Medium:
+  case CodeModel::Kernel:
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addExternalSymbol(ChkStk)
+        .addReg(AArch64::X15, RegState::Implicit)
+        .addReg(AArch64::X16,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .addReg(AArch64::X17,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .addReg(AArch64::NZCV,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+    break;
+  case CodeModel::Large:
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
+        .addReg(AArch64::X16, RegState::Define)
+        .addExternalSymbol(ChkStk)
+        .addExternalSymbol(ChkStk)
+        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
+        .addReg(AArch64::X16, RegState::Kill)
+        .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+        .addReg(AArch64::X16,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .addReg(AArch64::X17,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .addReg(AArch64::NZCV,
+                RegState::Implicit | RegState::Define | RegState::Dead)
+        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+    break;
+  }
+
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+      .addReg(AArch64::SP, RegState::Kill)
+      .addReg(AArch64::X15, RegState::Kill)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
+      .setMIFlags(MachineInstr::FrameSetup);
+  if (NeedsWinCFI) {
+    HasWinCFI = true;
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+  NumBytes = 0;
+
+  if (RealignmentPadding > 0) {
+    if (RealignmentPadding >= 4096) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
+          .addReg(AArch64::X16, RegState::Define)
+          .addImm(RealignmentPadding)
+          .setMIFlags(MachineInstr::FrameSetup);
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
+          .addReg(AArch64::SP)
+          .addReg(AArch64::X16, RegState::Kill)
+          .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
+          .addReg(AArch64::SP)
+          .addImm(RealignmentPadding)
+          .addImm(0)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+        .addReg(AArch64::X15, RegState::Kill)
+        .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
+    AFI->setStackRealigned(true);
+
+    // No need for SEH instructions here; if we're realigning the stack,
+    // we've set a frame pointer and already finished the SEH prologue.
+    assert(!NeedsWinCFI);
+  }
+  if (X15Scratch != AArch64::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
+        .addReg(AArch64::XZR)
+        .addReg(X15Scratch, RegState::Undef)
+        .addReg(X15Scratch, RegState::Implicit)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+}
+
+void AArch64PrologueEmitter::emitCalleeSavedGPRLocations(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  if (CSI.empty())
+    return;
+
+  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+  for (const auto &Info : CSI) {
+    unsigned FrameIdx = Info.getFrameIdx();
+    if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
+      continue; // Scalable slots: see emitCalleeSavedSVELocations().
+
+    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+    int64_t Offset = MFI.getObjectOffset(FrameIdx) - AFL.getOffsetOfLocalArea();
+    CFIBuilder.buildOffset(Info.getReg(), Offset);
+  }
+}
+
+void AArch64PrologueEmitter::emitCalleeSavedSVELocations(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Emit CFI offsets for the callee saves that live in scalable-vector slots.
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  if (CSI.empty())
+    return;
+
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+
+  std::optional<int64_t> IncomingVGOffsetFromDefCFA;
+  if (AFL.requiresSaveVG(MF)) {
+    auto IncomingVG = *find_if(
+        reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; });
+    IncomingVGOffsetFromDefCFA = MFI.getObjectOffset(IncomingVG.getFrameIdx()) -
+                                 AFL.getOffsetOfLocalArea();
+  }
+
+  for (const auto &Info : CSI) {
+    if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
+      continue; // Fixed-size slots: see emitCalleeSavedGPRLocations().
+
+    // Not all unwinders may know about SVE registers, so assume the lowest
+    // common denominator.
+    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
+    MCRegister Reg = Info.getReg();
+    if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
+      continue;
+
+    StackOffset Offset =
+        StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
+        StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
+
+    CFIBuilder.insertCFIInst(
+        createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA));
+  }
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
new file mode 100644
index 000000000000..94029ede60c7
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declaration of the AArch64PrologueEmitter class,
+/// which is used to emit the prologue on AArch64.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PROLOGUEEPILOGUE_H
+
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64FunctionInfo;
+class AArch64FrameLowering;
+
+/// A helper class for emitting the prologue. Substantial new functionality
+/// should be factored into a new method. Where possible "emit*" methods should
+/// be const, and any flags that change how the prologue is emitted should be
+/// set in the constructor.
+class AArch64PrologueEmitter {
+public:
+  AArch64PrologueEmitter(MachineFunction &MF, MachineBasicBlock &MBB,
+                         const AArch64FrameLowering &AFL);
+
+  /// Emit the prologue.
+  void emitPrologue();
+
+  ~AArch64PrologueEmitter() {
+    MF.setHasWinCFI(HasWinCFI); // Record the final HasWinCFI value on MF.
+#ifndef NDEBUG
+    verifyPrologueClobbers();
+#endif
+  }
+
+private:
+  void emitShadowCallStackPrologue(MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL) const;
+
+  void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
+                                         const DebugLoc &DL) const;
+
+  void emitEmptyStackFramePrologue(int64_t NumBytes,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL) const;
+
+  void emitFramePointerSetup(MachineBasicBlock::iterator MBBI,
+                             const DebugLoc &DL, unsigned FixedObject);
+
+  void emitDefineCFAWithFP(MachineBasicBlock::iterator MBBI,
+                           unsigned FixedObject) const;
+
+  void emitWindowsStackProbe(MachineBasicBlock::iterator MBBI,
+                             const DebugLoc &DL, int64_t &NumBytes,
+                             int64_t RealignmentPadding) const;
+
+  void emitCalleeSavedGPRLocations(MachineBasicBlock::iterator MBBI) const;
+  void emitCalleeSavedSVELocations(MachineBasicBlock::iterator MBBI) const;
+
+  void determineLocalsStackSize(uint64_t StackSize, uint64_t PrologueSaveSize);
+
+  MachineFunction &MF;
+  MachineBasicBlock &MBB;
+
+  const Function &F;
+  const MachineFrameInfo &MFI;
+  const AArch64Subtarget &Subtarget;
+  const AArch64FrameLowering &AFL;
+  const AArch64RegisterInfo &RegInfo;
+
+#ifndef NDEBUG
+  mutable LivePhysRegs LiveRegs{RegInfo};
+  MachineBasicBlock::iterator PrologueEndI;
+
+  void collectBlockLiveins();
+  void verifyPrologueClobbers() const;
+#endif
+
+  // Prologue flags. These generally should not change outside of the
+  // constructor. Two exceptions are "CombineSPBump" which is set in
+  // determineLocalsStackSize, and "NeedsWinCFI" which is set in
+  // emitFramePointerSetup.
+  bool EmitCFI = false;
+  bool EmitAsyncCFI = false;
+  bool HasFP = false;
+  bool IsFunclet = false;
+  bool CombineSPBump = false;
+  bool HomPrologEpilog = false;
+  bool NeedsWinCFI = false;
+
+  // Note: "HasWinCFI" is mutable as it can change in any "emit" function.
+  mutable bool HasWinCFI = false;
+
+  const TargetInstrInfo *TII = nullptr;
+  AArch64FunctionInfo *AFI = nullptr;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 1a7609bfee8a..431ed6ec34e7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -983,7 +983,7 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
 
 // Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet()
 // (without the use of the table-gen'd predicates).
-def SMEWithZPRPredicateSpills : HwMode<"", [Predicate<"false">]>;
+def SMEWithZPRPredicateSpills : HwMode<[Predicate<"false">]>;
 
 def PPRSpillFillRI : RegInfoByHwMode<
       [DefaultMode,              SMEWithZPRPredicateSpills],
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 125225df1546..601dc34d74b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -61,10 +61,10 @@ let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
 def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
           (AllocateZABuffer $size)>;
 
-def AArch64InitTPIDR2Obj  : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
-                              [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
+def AArch64InitTPIDR2Obj  : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 2,
+                              [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPMayStore]>;
 let usesCustomInserter = 1 in {
-  def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
+  def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer, GPR64:$save_slices), [(AArch64InitTPIDR2Obj GPR64:$buffer, GPR64:$save_slices)]>, Sched<[WriteI]> {}
 }
 
 // Nodes to allocate a save buffer for SME.
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
 def CommitZASavePseudo
   : Pseudo<(outs),
            (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -108,6 +110,11 @@ def AArch64_requires_za_save
            [SDNPHasChain, SDNPInGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
 
+def AArch64_sme_state_alloc
+  : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
+           [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eeb47b4d9975..7604ffdc9f64 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -412,6 +412,7 @@ def SDT_AArch64PTest : SDTypeProfile<1, 2, [
 ]>;
 def AArch64ptest     : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
 def AArch64ptest_any : SDNode<"AArch64ISD::PTEST_ANY", SDT_AArch64PTest>;
+def AArch64ptest_first : SDNode<"AArch64ISD::PTEST_FIRST", SDT_AArch64PTest>;
 
 def SDT_AArch64DUP_PRED  : SDTypeProfile<1, 3,
   [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0, 1>]>;
@@ -650,7 +651,7 @@ let Predicates = [HasSVE_or_SME, UseExperimentalZeroingPseudos] in {
 
 let Predicates = [HasSVE_or_SME] in {
   defm ADD_ZI   : sve_int_arith_imm0<0b000, "add", add>;
-  defm SUB_ZI   : sve_int_arith_imm0<0b001, "sub", sub>;
+  defm SUB_ZI   : sve_int_arith_imm0<0b001, "sub", sub, add>;
   defm SUBR_ZI  : sve_int_arith_imm0<0b011, "subr", AArch64subr>;
   defm SQADD_ZI : sve_int_arith_imm0_ssat<0b100, "sqadd", saddsat, ssubsat>;
   defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
@@ -1071,7 +1072,7 @@ let Predicates = [HasSVE_or_SME] in {
   defm BRKB_PPmP  : sve_int_break_m<0b101, "brkb",  int_aarch64_sve_brkb>;
   defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
 
-  defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any>;
+  defm PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest, AArch64ptest_any, AArch64ptest_first>;
   defm PFALSE   : sve_int_pfalse<0b000000, "pfalse">;
   defm PFIRST   : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
   defm PNEXT    : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
@@ -4141,8 +4142,8 @@ let Predicates = [HasSVE2_or_SME] in {
   defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>;
 
   // SVE2 pointer conflict compare
-  defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
-  defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
+  defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", loop_dependence_war_mask>;
+  defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", loop_dependence_raw_mask>;
 } // End HasSVE2_or_SME
 
 let Predicates = [HasSVEAES, HasNonStreamingSVE_or_SSVE_AES] in {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 0f4f0129e9cd..98e0a1180510 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,10 @@ static cl::opt<bool> EnableZPRPredicateSpills(
     cl::desc(
         "Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));
 
+static cl::opt<unsigned>
+    VScaleForTuningOpt("sve-vscale-for-tuning", cl::Hidden,
+                       cl::desc("Force a vscale for tuning factor for SVE"));
+
 // Subreg liveness tracking is disabled by default for now until all issues
 // are ironed out. This option allows the feature to be used in tests.
 static cl::opt<bool>
@@ -364,6 +368,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
 
   if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
     MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
+  if (VScaleForTuningOpt.getNumOccurrences() > 0)
+    VScaleForTuning = VScaleForTuningOpt;
 }
 
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 01c0bcc3a6a7..671df35cd379 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -246,8 +246,8 @@ public:
   /// Return true if the CPU supports any kind of instruction fusion.
   bool hasFusion() const {
     return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
-           hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() ||
-           hasFuseAdrpAdd() || hasFuseLiterals();
+           hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCmpCSel() ||
+           hasFuseCmpCSet() || hasFuseAdrpAdd() || hasFuseLiterals();
   }
 
   unsigned getEpilogueVectorizationMinVF() const {
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 1b0e90b0e0dc..65b752ed40c9 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -362,7 +362,7 @@ def lookupTSBByName : SearchIndex {
   let Key = ["Name"];
 }
 
-def : TSB<"csync", 0>;
+def : TSB<"csync", 2>;
 
 //===----------------------------------------------------------------------===//
 // PRFM (prefetch) instruction options.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index e67bd5869ccd..4650b2d0c815 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -589,7 +589,8 @@ void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   PB.registerLateLoopOptimizationsEPCallback(
       [=](LoopPassManager &LPM, OptimizationLevel Level) {
-        LPM.addPass(LoopIdiomVectorizePass());
+        if (Level != OptimizationLevel::O0)
+          LPM.addPass(LoopIdiomVectorizePass());
       });
   if (getTargetTriple().isOSWindows())
     PB.registerPipelineEarlySimplificationEPCallback(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a..92321a76dbd8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 #include <optional>
@@ -4409,6 +4410,32 @@ AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
   return 1;
 }
 
+/// Return true if the scheduling model gives Opcode1 a higher reciprocal
+/// throughput than Opcode2 (false without a model or with invalid classes).
+bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
+    unsigned Opcode1, unsigned Opcode2) const {
+  const MCSchedModel &Sched = ST->getSchedModel();
+  const TargetInstrInfo *TII = ST->getInstrInfo();
+  if (!Sched.hasInstrSchedModel())
+    return false; // No instruction scheduling model to consult.
+
+  const MCSchedClassDesc *SCD1 =
+      Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
+  const MCSchedClassDesc *SCD2 =
+      Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
+  // We cannot handle variant scheduling classes without an MI. If we need to
+  // support them for any of the instructions we query the information of we
+  // might need to add a way to resolve them without a MI or not use the
+  // scheduling info.
+  assert(!SCD1->isVariant() && !SCD2->isVariant() &&
+         "Cannot handle variant scheduling classes without an MI");
+  if (!SCD1->isValid() || !SCD2->isValid())
+    return false; // Nothing to compare for an invalid scheduling class.
+
+  return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
+         MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
+}
+
 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
     TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
@@ -4506,6 +4533,12 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
              (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
       Factor = 3; // fcmxx+fcmyy+or
 
+    if (isa<ScalableVectorType>(ValTy) &&
+        CostKind == TTI::TCK_RecipThroughput &&
+        hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
+                                                   AArch64::FCMEQv4f32))
+      Factor *= 2;
+
     return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
   }
 
@@ -4937,6 +4970,23 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (!L->getExitBlock())
     return;
 
+  // Check if the loop contains any reductions that could be parallelized when
+  // unrolling. If so, enable partial unrolling if the trip count is known to be
+  // a multiple of 2.
+  bool HasParellelizableReductions =
+      L->getNumBlocks() == 1 &&
+      any_of(L->getHeader()->phis(),
+             [&SE, L](PHINode &Phi) {
+               return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
+             }) &&
+      isLoopSizeWithinBudget(L, TTI, 12, nullptr);
+  if (HasParellelizableReductions &&
+      SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
+    UP.Partial = true;
+    UP.MaxCount = 4;
+    UP.AddAdditionalAccumulators = true;
+  }
+
   const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
   if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
       (SE.getSmallConstantMaxTripCount(L) > 0 &&
@@ -4952,6 +5002,12 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // Limit to loops with trip counts that are cheap to expand.
   UP.SCEVExpansionBudget = 1;
 
+  if (HasParellelizableReductions) {
+    UP.Runtime = true;
+    UP.DefaultUnrollRuntimeCount = 4;
+    UP.AddAdditionalAccumulators = true;
+  }
+
   // Try to unroll small loops, of few-blocks with low budget, if they have
   // load/store dependencies, to expose more parallel memory access streams,
   // or if they do little work inside a block (i.e. load -> X -> store pattern).
@@ -5486,13 +5542,14 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
 }
 
 InstructionCost
-AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                       VectorType *VecTy,
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                       Type *ResTy, VectorType *VecTy,
                                        TTI::TargetCostKind CostKind) const {
   EVT VecVT = TLI->getValueType(DL, VecTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
 
-  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
+  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
+      RedOpcode == Instruction::Add) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
 
     // The legal cases with dotprod are
@@ -5503,7 +5560,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
       return LT.first + 2;
   }
 
-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
+                                       CostKind);
 }
 
 InstructionCost
@@ -5750,11 +5808,14 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
-  // A subvector extract can be implemented with an ext (or trivial extract, if
-  // from lane 0). This currently only handles low or high extracts to prevent
-  // SLP vectorizer regressions.
+  // A subvector extract can be implemented with a NEON/SVE ext (or trivial
+  // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
+  // This currently only handles low or high extracts to prevent SLP vectorizer
+  // regressions.
+  // Note that SVE's ext instruction is destructive, but it can be fused with
+  // a movprfx to act like a constructive instruction.
   if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
-    if (LT.second.is128BitVector() &&
+    if (LT.second.getFixedSizeInBits() >= 128 &&
         cast<FixedVectorType>(SubTp)->getNumElements() ==
             LT.second.getVectorNumElements() / 2) {
       if (Index == 0)
@@ -6017,9 +6078,15 @@ static bool containsDecreasingPointers(Loop *TheLoop,
   return false;
 }
 
-bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
+bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
   if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
     return SVEPreferFixedOverScalableIfEqualCost;
+  // For cases like post-LTO vectorization, when we eventually know the trip
+  // count, epilogue with fixed-width vectorization can be deleted if the trip
+  // count is less than the epilogue iterations. That's why we prefer
+  // fixed-width vectorization in epilogue in case of equal costs.
+  if (IsEpilogue)
+    return true;
   return ST->useFixedOverScalableIfEqualCost();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 42ae962b3b42..fe2e849258e3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -174,6 +174,11 @@ public:
 
   bool prefersVectorizedAddressing() const override;
 
+  /// Check whether Opcode1 has less throughput according to the scheduling
+  /// model than Opcode2.
+  bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1,
+                                                  unsigned Opcode2) const;
+
   InstructionCost
   getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                         unsigned AddressSpace,
@@ -424,7 +429,7 @@ public:
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
-  bool preferFixedOverScalableIfEqualCost() const override;
+  bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override;
 
   unsigned getEpilogueVectorizationMinVF() const override;
 
@@ -460,7 +465,7 @@ public:
                            TTI::TargetCostKind CostKind) const override;
 
   InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
   InstructionCost
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 803943fd57c4..a8185358d6df 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
 tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler
+              -ignore-non-decodable-operands)
 tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
 tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
 tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner
@@ -91,6 +92,7 @@ add_llvm_target(AArch64CodeGen
   SVEIntrinsicOpts.cpp
   MachineSMEABIPass.cpp
   AArch64SIMDInstrOpt.cpp
+  AArch64PrologueEpilogue.cpp
 
   DEPENDS
   intrinsics_gen
@@ -107,6 +109,7 @@ add_llvm_target(AArch64CodeGen
   Core
   GlobalISel
   MC
+  Passes
   Scalar
   SelectionDAG
   Support
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 323db2a0728e..aa1c1c882e22 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -35,308 +35,14 @@ using namespace llvm::MCD;
 // Pull DecodeStatus and its enum values into the global namespace.
 using DecodeStatus = MCDisassembler::DecodeStatus;
 
-// Forward declare these because the autogenerated code will reference them.
-// Definitions are further down.
-template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass>
-static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                                const MCDisassembler *Decoder);
-template <unsigned Min, unsigned Max>
-static DecodeStatus DecodeZPRMul2_MinMax(MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeZK(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                             const MCDisassembler *Decoder);
-template <unsigned Min, unsigned Max>
-static DecodeStatus DecodeZPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeZPR4Mul4RegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const void *Decoder);
-template <unsigned NumBitsForTile>
-static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask,
-                                  uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodePPR2Mul2RegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const void *Decoder);
-
-static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSystemPStateImm0_15Instruction(MCInst &Inst, uint32_t insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSystemPStateImm0_1Instruction(MCInst &Inst, uint32_t insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeSyspXzrInstruction(MCInst &Inst, uint32_t insn,
-                                             uint64_t Addr,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
 template <int Bits>
 static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
                                const MCDisassembler *Decoder);
-template <int ElementWidth>
-static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
-                                       uint64_t Addr,
-                                       const MCDisassembler *Decoder);
-static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
-                                              uint64_t Addr,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
-                                              uint64_t Addr,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-#include "AArch64GenDisassemblerTables.inc"
-#include "AArch64GenInstrInfo.inc"
 
 #define Success MCDisassembler::Success
 #define Fail MCDisassembler::Fail
 #define SoftFail MCDisassembler::SoftFail
 
-static MCDisassembler *createAArch64Disassembler(const Target &T,
-                                                 const MCSubtargetInfo &STI,
-                                                 MCContext &Ctx) {
-
-  return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo());
-}
-
-DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 ArrayRef<uint8_t> Bytes,
-                                                 uint64_t Address,
-                                                 raw_ostream &CS) const {
-  CommentStream = &CS;
-
-  Size = 0;
-  // We want to read exactly 4 bytes of data.
-  if (Bytes.size() < 4)
-    return Fail;
-  Size = 4;
-
-  // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t Insn =
-      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
-
-  const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32};
-
-  for (const auto *Table : Tables) {
-    DecodeStatus Result =
-        decodeInstruction(Table, MI, Insn, Address, this, STI);
-
-    const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
-
-    // For Scalable Matrix Extension (SME) instructions that have an implicit
-    // operand for the accumulator (ZA) or implicit immediate zero which isn't
-    // encoded, manually insert operand.
-    for (unsigned i = 0; i < Desc.getNumOperands(); i++) {
-      if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) {
-        switch (Desc.operands()[i].RegClass) {
-        default:
-          break;
-        case AArch64::MPRRegClassID:
-          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA));
-          break;
-        case AArch64::MPR8RegClassID:
-          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0));
-          break;
-        case AArch64::ZTRRegClassID:
-          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0));
-          break;
-        }
-      } else if (Desc.operands()[i].OperandType ==
-                 AArch64::OPERAND_IMPLICIT_IMM_0) {
-        MI.insert(MI.begin() + i, MCOperand::createImm(0));
-      }
-    }
-
-    if (MI.getOpcode() == AArch64::LDR_ZA ||
-        MI.getOpcode() == AArch64::STR_ZA) {
-      // Spill and fill instructions have a single immediate used for both
-      // the vector select offset and optional memory offset. Replicate
-      // the decoded immediate.
-      const MCOperand &Imm4Op = MI.getOperand(2);
-      assert(Imm4Op.isImm() && "Unexpected operand type!");
-      MI.addOperand(Imm4Op);
-    }
-
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  return MCDisassembler::Fail;
-}
-
-uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
-                                                 uint64_t Address) const {
-  // AArch64 instructions are always 4 bytes wide, so there's no point
-  // in skipping any smaller number of bytes if an instruction can't
-  // be decoded.
-  return 4;
-}
-
-static MCSymbolizer *
-createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
-                                LLVMSymbolLookupCallback SymbolLookUp,
-                                void *DisInfo, MCContext *Ctx,
-                                std::unique_ptr<MCRelocationInfo> &&RelInfo) {
-  return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo,
-                                       SymbolLookUp, DisInfo);
-}
-
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeAArch64Disassembler() {
-  TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(),
-                                       createAArch64ExternalSymbolizer);
-  TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
-                                       createAArch64ExternalSymbolizer);
-  TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(),
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(),
-                                       createAArch64ExternalSymbolizer);
-
-  TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
-                                       createAArch64ExternalSymbolizer);
-  TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(),
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(),
-                                       createAArch64ExternalSymbolizer);
-}
-
 template <unsigned RegClassID, unsigned FirstReg, unsigned NumRegsInClass>
 static DecodeStatus DecodeSimpleRegisterClass(MCInst &Inst, unsigned RegNo,
                                               uint64_t Address,
@@ -492,11 +198,7 @@ static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
 static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
                                        uint64_t Addr,
                                        const MCDisassembler *Decoder) {
-  int64_t ImmVal = Imm;
-
-  // Sign-extend 19-bit immediate.
-  if (ImmVal & (1 << (19 - 1)))
-    ImmVal |= ~((1LL << 19) - 1);
+  int64_t ImmVal = SignExtend64<19>(Imm);
 
   if (!Decoder->tryAddingSymbolicOperand(
           Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4))
@@ -506,11 +208,7 @@ static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
 
 static DecodeStatus DecodePCRelLabel9(MCInst &Inst, unsigned Imm, uint64_t Addr,
                                       const MCDisassembler *Decoder) {
-  int64_t ImmVal = Imm;
-
-  // Sign-extend 9-bit immediate.
-  if (ImmVal & (1 << (9 - 1)))
-    ImmVal |= ~((1LL << 9) - 1);
+  int64_t ImmVal = SignExtend64<9>(Imm);
 
   if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal * 4), Addr,
                                          /*IsBranch=*/true, 0, 0, 4))
@@ -827,12 +525,7 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn,
                                                 const MCDisassembler *Decoder) {
   unsigned Rt = fieldFromInstruction(insn, 0, 5);
   unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  int64_t offset = fieldFromInstruction(insn, 12, 9);
-
-  // offset is a 9-bit signed immediate, so sign extend it to
-  // fill the unsigned.
-  if (offset & (1 << (9 - 1)))
-    offset |= ~((1LL << 9) - 1);
+  int64_t offset = SignExtend64<9>(fieldFromInstruction(insn, 12, 9));
 
   // First operand is always the writeback to the address register, if needed.
   switch (Inst.getOpcode()) {
@@ -1129,14 +822,9 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
   unsigned Rt = fieldFromInstruction(insn, 0, 5);
   unsigned Rn = fieldFromInstruction(insn, 5, 5);
   unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
-  int64_t offset = fieldFromInstruction(insn, 15, 7);
+  int64_t offset = SignExtend64<7>(fieldFromInstruction(insn, 15, 7));
   bool IsLoad = fieldFromInstruction(insn, 22, 1);
 
-  // offset is a 7-bit signed immediate, so sign extend it to
-  // fill the unsigned.
-  if (offset & (1 << (7 - 1)))
-    offset |= ~((1LL << 7) - 1);
-
   unsigned Opcode = Inst.getOpcode();
   bool NeedsDisjointWritebackTransfer = false;
 
@@ -1505,12 +1193,8 @@ static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn,
                                          uint64_t Addr,
                                          const MCDisassembler *Decoder) {
   unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
-  imm |= fieldFromInstruction(insn, 29, 2);
-
-  // Sign-extend the 21-bit immediate.
-  if (imm & (1 << (21 - 1)))
-    imm |= ~((1LL << 21) - 1);
+  int64_t imm = SignExtend64<21>((fieldFromInstruction(insn, 5, 19) << 2) |
+                                 fieldFromInstruction(insn, 29, 2));
 
   DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>(Inst, Rd, Addr,
                                                              Decoder);
@@ -1564,11 +1248,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn,
 static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn,
                                               uint64_t Addr,
                                               const MCDisassembler *Decoder) {
-  int64_t imm = fieldFromInstruction(insn, 0, 26);
-
-  // Sign-extend the 26-bit immediate.
-  if (imm & (1 << (26 - 1)))
-    imm |= ~((1LL << 26) - 1);
+  int64_t imm = SignExtend64<26>(fieldFromInstruction(insn, 0, 26));
 
   if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4))
     Inst.addOperand(MCOperand::createImm(imm));
@@ -1631,11 +1311,7 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn,
   uint64_t Rt = fieldFromInstruction(insn, 0, 5);
   uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
   bit |= fieldFromInstruction(insn, 19, 5);
-  int64_t dst = fieldFromInstruction(insn, 5, 14);
-
-  // Sign-extend 14-bit immediate.
-  if (dst & (1 << (14 - 1)))
-    dst |= ~((1LL << 14) - 1);
+  int64_t dst = SignExtend64<14>(fieldFromInstruction(insn, 5, 14));
 
   if (fieldFromInstruction(insn, 31, 1) == 0)
     DecodeSimpleRegisterClass<AArch64::GPR32RegClassID, 0, 32>(Inst, Rt, Addr,
@@ -1856,3 +1532,129 @@ static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn,
 
   return Success;
 }
+
+static DecodeStatus
+DecodeSMESpillFillInstruction(MCInst &Inst, uint32_t Bits, uint64_t Addr,
+                              const MCDisassembler *Decoder) {
+  unsigned RvBits = fieldFromInstruction(Bits, 13, 2);
+  unsigned RnBits = fieldFromInstruction(Bits, 5, 5);
+  unsigned Imm4Bits = fieldFromInstruction(Bits, 0, 4);
+
+  DecodeSimpleRegisterClass<AArch64::MatrixIndexGPR32_12_15RegClassID, 0, 4>(
+      Inst, RvBits, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(Imm4Bits));
+  DecodeSimpleRegisterClass<AArch64::GPR64spRegClassID, 0, 32>(Inst, RnBits,
+                                                               Addr, Decoder);
+  // Spill and fill instructions have a single immediate used for both
+  // the vector select offset and optional memory offset. Replicate
+  // the decoded immediate.
+  Inst.addOperand(MCOperand::createImm(Imm4Bits));
+  return Success;
+}
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+
+  return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  Size = 0;
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4)
+    return Fail;
+  Size = 4;
+
+  // Encoded as a little-endian 32-bit word in the stream.
+  uint32_t Insn =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+
+  const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32};
+
+  for (const auto *Table : Tables) {
+    DecodeStatus Result =
+        decodeInstruction(Table, MI, Insn, Address, this, STI);
+
+    const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+
+    // For Scalable Matrix Extension (SME) instructions that have an implicit
+    // operand for the accumulator (ZA) or implicit immediate zero which isn't
+    // encoded, manually insert operand.
+    for (unsigned i = 0; i < Desc.getNumOperands(); i++) {
+      if (Desc.operands()[i].OperandType == MCOI::OPERAND_REGISTER) {
+        switch (Desc.operands()[i].RegClass) {
+        default:
+          break;
+        case AArch64::MPRRegClassID:
+          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA));
+          break;
+        case AArch64::MPR8RegClassID:
+          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0));
+          break;
+        case AArch64::ZTRRegClassID:
+          MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZT0));
+          break;
+        }
+      } else if (Desc.operands()[i].OperandType ==
+                 AArch64::OPERAND_IMPLICIT_IMM_0) {
+        MI.insert(MI.begin() + i, MCOperand::createImm(0));
+      }
+    }
+
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address) const {
+  // AArch64 instructions are always 4 bytes wide, so there's no point
+  // in skipping any smaller number of bytes if an instruction can't
+  // be decoded.
+  return 4;
+}
+
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
+                                LLVMSymbolLookupCallback SymbolLookUp,
+                                void *DisInfo, MCContext *Ctx,
+                                std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+  return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo,
+                                       SymbolLookUp, DisInfo);
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(),
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCDisassembler(getTheAArch64_32Target(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheAArch64_32Target(),
+                                       createAArch64ExternalSymbolizer);
+
+  TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCDisassembler(getTheARM64_32Target(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheARM64_32Target(),
+                                       createAArch64ExternalSymbolizer);
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0bceb322726d..5748556d0728 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6608,45 +6608,6 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
   switch (IntrinID) {
   default:
     break;
-  case Intrinsic::aarch64_crypto_sha1h: {
-    Register DstReg = I.getOperand(0).getReg();
-    Register SrcReg = I.getOperand(2).getReg();
-
-    // FIXME: Should this be an assert?
-    if (MRI.getType(DstReg).getSizeInBits() != 32 ||
-        MRI.getType(SrcReg).getSizeInBits() != 32)
-      return false;
-
-    // The operation has to happen on FPRs. Set up some new FPR registers for
-    // the source and destination if they are on GPRs.
-    if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
-      SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
-      MIB.buildCopy({SrcReg}, {I.getOperand(2)});
-
-      // Make sure the copy ends up getting constrained properly.
-      RBI.constrainGenericRegister(I.getOperand(2).getReg(),
-                                   AArch64::GPR32RegClass, MRI);
-    }
-
-    if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
-      DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
-
-    // Actually insert the instruction.
-    auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
-    constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
-
-    // Did we create a new register for the destination?
-    if (DstReg != I.getOperand(0).getReg()) {
-      // Yep. Copy the result of the instruction back into the original
-      // destination.
-      MIB.buildCopy({I.getOperand(0)}, {DstReg});
-      RBI.constrainGenericRegister(I.getOperand(0).getReg(),
-                                   AArch64::GPR32RegClass, MRI);
-    }
-
-    I.eraseFromParent();
-    return true;
-  }
   case Intrinsic::ptrauth_resign: {
     Register DstReg = I.getOperand(0).getReg();
     Register ValReg = I.getOperand(2).getReg();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 210643f6f2f4..ff09b375c310 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -222,7 +222,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s64, v2s64)
       .moreElementsToNextPow2(0)
       .minScalarSameAs(1, 0)
-      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .minScalarEltSameAsIf(isVector(0), 1, 0)
+      .maxScalarEltSameAsIf(isVector(0), 1, 0);
 
   getActionDefinitionsBuilder(G_PTR_ADD)
       .legalFor({{p0, s64}, {v2p0, v2s64}})
@@ -879,8 +881,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                  {v2s32, v2s32},
                  {v4s32, v4s32},
                  {v2s64, v2s64}})
-      .legalFor(HasFP16,
-                {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
+      .legalFor(
+          HasFP16,
+          {{s16, s16}, {s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
       // Handle types larger than i64 by scalarizing/lowering.
       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
@@ -1150,7 +1153,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampMaxNumElements(1, s32, 4)
       .clampMaxNumElements(1, s16, 8)
       .clampMaxNumElements(1, s8, 16)
-      .clampMaxNumElements(1, p0, 2);
+      .clampMaxNumElements(1, p0, 2)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1);
 
   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
       .legalIf(
@@ -1165,7 +1169,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v4s16, v8s16)
       .clampNumElements(0, v2s32, v4s32)
       .clampMaxNumElements(0, s64, 2)
-      .clampMaxNumElements(0, p0, 2);
+      .clampMaxNumElements(0, p0, 2)
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
 
   getActionDefinitionsBuilder(G_BUILD_VECTOR)
       .legalFor({{v8s8, s8},
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 1b919abd222e..62de86bf87f5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -482,6 +482,10 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
   case Intrinsic::aarch64_neon_sqrdmulh:
   case Intrinsic::aarch64_neon_sqadd:
   case Intrinsic::aarch64_neon_sqsub:
+  case Intrinsic::aarch64_crypto_sha1h:
+  case Intrinsic::aarch64_crypto_sha1c:
+  case Intrinsic::aarch64_crypto_sha1p:
+  case Intrinsic::aarch64_crypto_sha1m:
     return true;
   case Intrinsic::aarch64_neon_saddlv: {
     const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
@@ -848,10 +852,20 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
     break;
   }
+  case TargetOpcode::G_FPTOSI_SAT:
+  case TargetOpcode::G_FPTOUI_SAT: {
+    LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+    if (DstType.isVector())
+      break;
+    if (DstType == LLT::scalar(16)) {
+      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+      break;
+    }
+    OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
+    break;
+  }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI:
-  case TargetOpcode::G_FPTOSI_SAT:
-  case TargetOpcode::G_FPTOUI_SAT:
   case TargetOpcode::G_INTRINSIC_LRINT:
   case TargetOpcode::G_INTRINSIC_LLRINT:
     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 54b58e948daf..2552ee300933 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -365,13 +365,6 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
     return;
   }
 
-  // Instruction TSB is specified as a one operand instruction, but 'csync' is
-  // not encoded, so for printing it is treated as a special case here:
-  if (Opcode == AArch64::TSB) {
-    O << "\ttsb\tcsync";
-    return;
-  }
-
   if (!PrintAliases || !printAliasInstr(MI, Address, STI, O))
     printInstruction(MI, Address, STI, O);
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 828c5c546240..2b5cf3484ffc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -53,9 +53,9 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64::S_MACHO_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
-StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+StringRef AArch64::getSpecifierName(AArch64::Specifier S) {
   // clang-format off
-  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+  switch (static_cast<uint32_t>(S)) {
   case AArch64::S_CALL:                return "";
   case AArch64::S_LO12:                return ":lo12:";
   case AArch64::S_ABS_G3:              return ":abs_g3:";
@@ -124,7 +124,7 @@ static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
   if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
     return false;
   Res.setSpecifier(Expr.getSpecifier());
-  return true;
+  return !Res.getSubSym();
 }
 
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
@@ -183,7 +183,7 @@ void AArch64MCAsmInfoDarwin::printSpecifierExpr(
     raw_ostream &OS, const MCSpecifierExpr &Expr) const {
   if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
     return AE->print(OS, this);
-  OS << AArch64::getSpecifierName(Expr);
+  OS << AArch64::getSpecifierName(Expr.getSpecifier());
   printExpr(OS, *Expr.getSubExpr());
 }
 
@@ -232,7 +232,7 @@ void AArch64MCAsmInfoELF::printSpecifierExpr(
     raw_ostream &OS, const MCSpecifierExpr &Expr) const {
   if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
     return AE->print(OS, this);
-  OS << AArch64::getSpecifierName(Expr);
+  OS << AArch64::getSpecifierName(Expr.getSpecifier());
   printExpr(OS, *Expr.getSubExpr());
 }
 
@@ -262,7 +262,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
 
 void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
     raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
+  OS << AArch64::getSpecifierName(Expr.getSpecifier());
   printExpr(OS, *Expr.getSubExpr());
 }
 
@@ -292,7 +292,7 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
 void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
     raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
+  OS << AArch64::getSpecifierName(Expr.getSpecifier());
   printExpr(OS, *Expr.getSubExpr());
 }
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index c28e925d77e2..0dfa61b1dc60 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -181,7 +181,7 @@ enum {
 
 /// Return the string representation of the ELF relocation specifier
 /// (e.g. ":got:", ":lo12:").
-StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+StringRef getSpecifierName(Specifier S);
 
 inline Specifier getSymbolLoc(Specifier S) {
   return static_cast<Specifier>(S & AArch64::S_SymLocBits);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 91bdc880998b..7774d07a214b 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -65,15 +65,16 @@ void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
 bool isHForm(const MCInst &MI, const MCInstrInfo *MCII);
 bool isQForm(const MCInst &MI, const MCInstrInfo *MCII);
 bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII);
-}
+} // namespace AArch64_MC
 
 namespace AArch64 {
 enum OperandType {
   OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET,
+  OPERAND_SHIFT_MSL,
 };
 } // namespace AArch64
 
-} // End llvm namespace
+} // namespace llvm
 
 // Defines symbolic names for AArch64 registers.  This defines a mapping from
 // register name to register number.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index a53b676142a0..5fe999389ce7 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -73,9 +73,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       // Supported
       break;
     default:
-      Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
-                                          " unsupported on COFF targets");
+      Ctx.reportError(Fixup.getLoc(),
+                      "relocation specifier " +
+                          AArch64::getSpecifierName(A64E->getSpecifier()) +
+                          " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
   }
@@ -83,9 +84,10 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   switch (FixupKind) {
   default: {
     if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
-      Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
-                                          " unsupported on COFF targets");
+      Ctx.reportError(Fixup.getLoc(),
+                      "relocation specifier " +
+                          AArch64::getSpecifierName(A64E->getSpecifier()) +
+                          " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());
       Ctx.reportError(Fixup.getLoc(), Twine("relocation type ") + Info.Name +
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b58dfdf32e4a..c39a5cc2fcb1 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
 //
 //===----------------------------------------------------------------------===//
 //
@@ -139,8 +139,8 @@ StringRef getZAStateString(ZAState State) {
 #undef MAKE_CASE
 }
 
-static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI,
-                           const MachineOperand &MO) {
+static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
+                          const MachineOperand &MO) {
   if (!MO.isReg() || !MO.getReg().isPhysical())
     return false;
   return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
@@ -166,7 +166,7 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
     return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
 
   for (auto &MO : MI.operands()) {
-    if (isZAorZT0RegOp(TRI, MO))
+    if (isZAorZTRegOp(TRI, MO))
       return {ZAState::ACTIVE, InsertPt};
   }
 
@@ -215,9 +215,44 @@ struct MachineSMEABI : public MachineFunctionPass {
   void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  bool ClearTPIDR2);
 
+  // Emission routines for agnostic ZA functions.
+  void emitSetupFullZASave(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           LiveRegs PhysLiveRegs);
+  // Emit a "full" ZA save or restore. It is "full" in the sense that this
+  // function will emit a call to __arm_sme_save or __arm_sme_restore, which
+  // handles saving and restoring both ZA and ZT0.
+  void emitFullZASaveRestore(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             LiveRegs PhysLiveRegs, bool IsSave);
+  void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    LiveRegs PhysLiveRegs);
+
   void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                        ZAState From, ZAState To, LiveRegs PhysLiveRegs);
 
+  // Helpers for switching between lazy/full ZA save/restore routines.
+  void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  LiveRegs PhysLiveRegs) {
+    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+    return emitSetupLazySave(MBB, MBBI);
+  }
+  void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     LiveRegs PhysLiveRegs) {
+    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+    return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+  }
+  void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                LiveRegs PhysLiveRegs) {
+    if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
+      return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+    return emitAllocateLazySaveBuffer(MBB, MBBI);
+  }
+
   /// Save live physical registers to virtual registers.
   PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MBBI, DebugLoc DL);
@@ -228,6 +263,8 @@ struct MachineSMEABI : public MachineFunctionPass {
   /// Get or create a TPIDR2 block in this function.
   TPIDR2State getTPIDR2Block();
 
+  Register getAgnosticZABufferPtr();
+
 private:
   /// Contains the needed ZA state (and live registers) at an instruction.
   struct InstInfo {
@@ -241,6 +278,7 @@ private:
   struct BlockInfo {
     ZAState FixedEntryState{ZAState::ANY};
     SmallVector<InstInfo> Insts;
+    LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
 
@@ -249,24 +287,29 @@ private:
     SmallVector<BlockInfo> Blocks;
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
+    std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+    Register AgnosticZABufferPtr = AArch64::NoRegister;
+    LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
   } State;
 
   MachineFunction *MF = nullptr;
   EdgeBundles *Bundles = nullptr;
   const AArch64Subtarget *Subtarget = nullptr;
   const AArch64RegisterInfo *TRI = nullptr;
+  const AArch64FunctionInfo *AFI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   MachineRegisterInfo *MRI = nullptr;
 };
 
 void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
-  assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+  assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+          SMEFnAttrs.hasZAState()) &&
          "Expected function to have ZA/ZT0 state!");
 
   State.Blocks.resize(MF->getNumBlockIDs());
   for (MachineBasicBlock &MBB : *MF) {
     BlockInfo &Block = State.Blocks[MBB.getNumber()];
-    if (&MBB == &MF->front()) {
+    if (MBB.isEntryBlock()) {
       // Entry block:
       Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
                                   ? ZAState::CALLER_DORMANT
@@ -294,10 +337,20 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
 
     Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
     auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+    auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
     for (MachineInstr &MI : reverse(MBB)) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
       LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+      // The SMEStateAllocPseudo marker is added to a function if the save
+      // buffer was allocated in SelectionDAG. It marks the end of the
+      // allocation -- which is a safe point for this pass to insert any TPIDR2
+      // block setup.
+      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+        State.AfterSMEProloguePt = MBBI;
+        State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
+      }
+      // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
       auto [NeededState, InsertPt] = getZAStateBeforeInst(
           *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
       assert((InsertPt == MBBI ||
@@ -306,6 +359,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       // TODO: Do something to avoid state changes where NZCV is live.
       if (MBBI == FirstTerminatorInsertPt)
         Block.PhysLiveRegsAtExit = PhysLiveRegs;
+      if (MBBI == FirstNonPhiInsertPt)
+        Block.PhysLiveRegsAtEntry = PhysLiveRegs;
       if (NeededState != ZAState::ANY)
         Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
     }
@@ -529,23 +584,25 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
 void MachineSMEABI::emitAllocateLazySaveBuffer(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
-
   DebugLoc DL = getDebugLoc(MBB, MBBI);
   Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
   Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-  Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
 
   // Calculate SVL.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
 
   // 1. Allocate the lazy save buffer.
-  {
-    // TODO This function grows the stack with a subtraction, which doesn't work
-    // on Windows. Some refactoring to share the functionality in
-    // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
-    // supports SME
+  if (Buffer == AArch64::NoRegister) {
+    // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
+    // Buffer != AArch64::NoRegister). This is done to reuse the existing
+    // expansions (which can insert stack checks). This works, but it means we
+    // will always allocate the lazy save buffer (even if the function contains
+    // no lazy saves). If we want to handle Windows here, we'll need to
+    // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
     assert(!Subtarget->isTargetWindows() &&
            "Lazy ZA save is not yet supported on Windows");
+    Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
     // Get original stack pointer.
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
         .addReg(AArch64::SP);
@@ -590,8 +647,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
       .addImm(AArch64SysReg::TPIDR2_EL0);
   // If TPIDR2_EL0 is non-zero, commit the lazy save.
   // NOTE: Functions that only use ZT0 don't need to zero ZA.
-  bool ZeroZA =
-      MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState();
+  bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
   auto CommitZASave =
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
           .addReg(TPIDR2EL0)
@@ -606,6 +662,86 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
       .addImm(1);
 }
 
+Register MachineSMEABI::getAgnosticZABufferPtr() {
+  if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+    return State.AgnosticZABufferPtr;
+  Register BufferPtr = AFI->getEarlyAllocSMESaveBuffer();
+  State.AgnosticZABufferPtr =
+      BufferPtr != AArch64::NoRegister
+          ? BufferPtr
+          : MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          LiveRegs PhysLiveRegs, bool IsSave) {
+  auto *TLI = Subtarget->getTargetLowering();
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = AArch64::X0;
+
+  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+  // Copy the buffer pointer into X0.
+  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+      .addReg(getAgnosticZABufferPtr());
+
+  // Call __arm_sme_save/__arm_sme_restore.
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+      .addReg(BufferPtr, RegState::Implicit)
+      .addExternalSymbol(TLI->getLibcallName(
+          IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
+      .addRegMask(TRI->getCallPreservedMask(
+          *MF,
+          CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+
+  restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    LiveRegs PhysLiveRegs) {
+  // Buffer already allocated in SelectionDAG.
+  if (AFI->getEarlyAllocSMESaveBuffer())
+    return;
+
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = getAgnosticZABufferPtr();
+  Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+  // Calculate the SME state size.
+  {
+    auto *TLI = Subtarget->getTargetLowering();
+    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
+        .addReg(AArch64::X0, RegState::ImplicitDefine)
+        .addRegMask(TRI->getCallPreservedMask(
+            *MF, CallingConv::
+                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
+        .addReg(AArch64::X0);
+  }
+
+  // Allocate a buffer object of the size given by __arm_sme_state_size.
+  {
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+        .addReg(AArch64::SP)
+        .addReg(BufferSize)
+        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+        .addReg(AArch64::SP);
+
+    // We have just allocated a variable-sized object; tell PEI about it.
+    MFI.CreateVariableSizedObject(Align(16), nullptr);
+  }
+
+  restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
 void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
                                     ZAState From, ZAState To,
@@ -623,10 +759,7 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
   // TODO: Avoid setting up the save buffer if there's no transition to
   // LOCAL_SAVED.
   if (From == ZAState::CALLER_DORMANT) {
-    assert(MBB.getParent()
-               ->getInfo<AArch64FunctionInfo>()
-               ->getSMEFnAttrs()
-               .hasPrivateZAInterface() &&
+    assert(AFI->getSMEFnAttrs().hasPrivateZAInterface() &&
            "CALLER_DORMANT state requires private ZA interface");
     assert(&MBB == &MBB.getParent()->front() &&
            "CALLER_DORMANT state only valid in entry block");
@@ -641,12 +774,14 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
   }
 
   if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitSetupLazySave(MBB, InsertPt);
+    emitZASave(MBB, InsertPt, PhysLiveRegs);
   else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+    emitZARestore(MBB, InsertPt, PhysLiveRegs);
   else if (To == ZAState::OFF) {
     assert(From != ZAState::CALLER_DORMANT &&
            "CALLER_DORMANT to OFF should have already been handled");
+    assert(!AFI->getSMEFnAttrs().hasAgnosticZAInterface() &&
+           "Should not turn ZA off in agnostic ZA function");
     emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
   } else {
     dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@@ -664,9 +799,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
     return false;
 
-  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AFI = MF.getInfo<AArch64FunctionInfo>();
   SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
-  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+      !SMEFnAttrs.hasAgnosticZAInterface())
     return false;
 
   assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -685,9 +821,19 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   insertStateChanges();
 
   // Allocate save buffer (if needed).
-  if (State.TPIDR2Block) {
-    MachineBasicBlock &EntryBlock = MF.front();
-    emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+  if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) {
+    if (State.AfterSMEProloguePt) {
+      // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+      // entry block (due to the probing loop).
+      emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+                               *State.AfterSMEProloguePt,
+                               State.PhysLiveRegsAfterSMEPrologue);
+    } else {
+      MachineBasicBlock &EntryBlock = MF.front();
+      emitAllocateZASaveBuffer(
+          EntryBlock, EntryBlock.getFirstNonPHI(),
+          State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
+    }
   }
 
   return true;
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 2008516885c3..79ceb2ababc7 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -50,8 +50,7 @@ private:
 
 char SMEABI::ID = 0;
 static const char *name = "SME ABI Pass";
-INITIALIZE_PASS_BEGIN(SMEABI, DEBUG_TYPE, name, false, false)
-INITIALIZE_PASS_END(SMEABI, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS(SMEABI, DEBUG_TYPE, name, false, false)
 
 FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); }
 
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index b3005d512022..40ec371fe79d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1108,6 +1108,10 @@ class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr>
     : I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "",
         []>,
       Sched<[]> {
+  // 'offset' operand is encoded in the same bits as 'imm4'. There is currently
+  // no way to tell TableGen about this.
+  let DecoderMethod = "DecodeSMESpillFillInstruction";
+  bits<0> ZAt;
   bits<2> Rv;
   bits<5> Rn;
   bits<4> imm4;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a3a7d0f74e1b..f8c1fe81c678 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -315,10 +315,16 @@ def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16
 def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
 def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
 
-def SVEAddSubImm8Pat  : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
-def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
-def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
-def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
+let Complexity = 1 in {
+def SVEAddSubImm8Pat  : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, false>", []>;
+def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, false>", []>;
+def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, false>", []>;
+def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, false>", []>;
+
+def SVEAddSubNegImm8Pat  : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8, true>", []>;
+def SVEAddSubNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16, true>", []>;
+def SVEAddSubNegImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32, true>", []>;
+def SVEAddSubNegImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64, true>", []>;
 
 def SVEAddSubSSatNegImm8Pat  : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i8, true>", []>;
 def SVEAddSubSSatNegImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, true>", []>;
@@ -329,6 +335,7 @@ def SVEAddSubSSatPosImm8Pat  : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MV
 def SVEAddSubSSatPosImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i16, false>", []>;
 def SVEAddSubSSatPosImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubSSatImm<MVT::i32, false>", []>;
 def SVEAddSubSSatPosImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubSSatImm<MVT::i64, false>", []>;
+} // Complexity = 1
 
 def SVECpyDupImm8Pat  : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>;
 def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>;
@@ -886,13 +893,17 @@ class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op>
 }
 
 multiclass sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op,
-                         SDPatternOperator op_any> {
+                         SDPatternOperator op_any, SDPatternOperator op_first> {
   def NAME : sve_int_ptest<opc, asm, op>;
 
   let hasNoSchedulingInfo = 1, isCompare = 1, Defs = [NZCV] in {
   def _ANY : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
                     [(set NZCV, (op_any (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>,
              PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>;
+
+  def _FIRST : Pseudo<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
+                    [(set NZCV, (op_first (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn)))]>,
+             PseudoInstExpansion<(!cast<Instruction>(NAME) PPRAny:$Pg, PPR8:$Pn)>;
   }
 }
 
@@ -5154,11 +5165,14 @@ multiclass sve_int_dup_imm<string asm> {
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
 
   def : InstAlias<"fmov $Zd, #0.0",
-                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
+                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd,
+                       (cpy_imm8_opt_lsl_i16 0, 0)), 1>;
   def : InstAlias<"fmov $Zd, #0.0",
-                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
+                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd,
+                       (cpy_imm8_opt_lsl_i32 0, 0)), 1>;
   def : InstAlias<"fmov $Zd, #0.0",
-                  (!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
+                  (!cast<Instruction>(NAME # _D) ZPR64:$Zd,
+                       (cpy_imm8_opt_lsl_i64 0, 0)), 1>;
 }
 
 class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
@@ -5218,7 +5232,8 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
   let hasSideEffects = 0;
 }
 
-multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op,
+                              SDPatternOperator inv_op = null_frag> {
   def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8,  addsub_imm8_opt_lsl_i8>;
   def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
   def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
@@ -5228,6 +5243,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
   def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
   def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
   def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
+
+  // Extra patterns for add(x, splat(-ve)) -> sub(x, +ve). There is no i8
+  // pattern as all i8 constants can be handled by an add.
+  def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, inv_op, ZPR16, i32, SVEAddSubNegImm16Pat, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, inv_op, ZPR32, i32, SVEAddSubNegImm32Pat, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, inv_op, ZPR64, i64, SVEAddSubNegImm64Pat, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_int_arith_imm0_ssat<bits<3> opc, string asm, SDPatternOperator op,
@@ -5549,11 +5570,14 @@ multiclass sve_int_dup_imm_pred_merge<string asm, SDPatternOperator op> {
                                             nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
 
   def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
-                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
+                  (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg,
+                       (cpy_imm8_opt_lsl_i16 0, 0)), 0>;
   def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
-                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
+                  (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg,
+                       (cpy_imm8_opt_lsl_i32 0, 0)), 0>;
   def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
-                  (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
+                  (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg,
+                       (cpy_imm8_opt_lsl_i64 0, 0)), 0>;
 
   def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)),
             (!cast<Instruction>(NAME # _H) $Zd, $Pg, 0, 0)>;
@@ -5946,16 +5970,20 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
   let isWhile = 1;
 }
 
-multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
+multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
   def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
   def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
   def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
   def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
 
-  def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
-  def : SVE_2_Op_Pat<nxv8i1,  !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
-  def : SVE_2_Op_Pat<nxv4i1,  !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
-  def : SVE_2_Op_Pat<nxv2i1,  !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
+  def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1))),
+            (!cast<Instruction>(NAME # _B) $Op1, $Op2)>;
+  def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2))),
+            (!cast<Instruction>(NAME # _H) $Op1, $Op2)>;
+  def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4))),
+            (!cast<Instruction>(NAME # _S) $Op1, $Op2)>;
+  def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8))),
+            (!cast<Instruction>(NAME # _D) $Op1, $Op2)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b..0f2c33585884 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
 ModulePass *createAMDGPULowerBufferFatPointersPass();
+ModulePass *createAMDGPULowerIntrinsicsLegacyPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsLegacyPass();
 FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ private:
   const TargetMachine &TM;
 };
 
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+  AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+  const AMDGPUTargetMachine &TM;
+};
+
 void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
 extern char &AMDGPUPrepareAGPRAllocLegacyID;
 
@@ -490,6 +501,9 @@ extern char &SIModeRegisterID;
 void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 
+void initializeAMDGPULowerVGPREncodingLegacyPass(PassRegistry &);
+extern char &AMDGPULowerVGPREncodingLegacyID;
+
 void initializeSIInsertHardClausesLegacyPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8e4b6365dc06..ffbda14dcd84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
 def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
   "FlatGlobalInsts",
   "true",
-  "Have global_* flat memory instructions"
+  "Have global_* flat memory instructions",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
   "FlatScratchInsts",
   "true",
-  "Have scratch_* flat memory instructions"
+  "Have scratch_* flat memory instructions",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
@@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
 def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
   "FlatGVSMode",
   "true",
-  "Have GVS addressing mode with flat_* instructions"
+  "Have GVS addressing mode with flat_* instructions",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
@@ -286,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
   "VMEM CU scope prefetches do not fail on illegal address"
 >;
 
-def FeatureCUStores : SubtargetFeature<"cu-stores",
-  "HasCUStores",
-  "true",
-  "Whether SCOPE_CU stores can be used on GFX12.5"
->;
-
 def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
   "HasVcmpxExecWARHazard",
   "true",
@@ -419,6 +416,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
   "Additional instructions for GFX9+"
 >;
 
+def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2",
+  "RequiresAlignVGPR",
+  "true",
+  "VGPR and AGPR tuple operands require even alignment"
+>;
+
 def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
   "GFX90AInsts",
   "true",
@@ -928,13 +931,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo
 def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
   "HasAtomicFMinFMaxF32FlatInsts",
   "true",
-  "Has flat memory instructions for atomicrmw fmin/fmax for float"
+  "Has flat memory instructions for atomicrmw fmin/fmax for float",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
   "HasAtomicFMinFMaxF64FlatInsts",
   "true",
-  "Has flat memory instructions for atomicrmw fmin/fmax for double"
+  "Has flat memory instructions for atomicrmw fmin/fmax for double",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
@@ -986,7 +991,8 @@ def FeatureFlatAtomicFaddF32Inst
   : SubtargetFeature<"flat-atomic-fadd-f32-inst",
   "HasFlatAtomicFaddF32Inst",
   "true",
-  "Has flat_atomic_add_f32 instruction"
+  "Has flat_atomic_add_f32 instruction",
+  [FeatureFlatAddressSpace]
 >;
 
 def FeatureFlatBufferGlobalAtomicFaddF64Inst
@@ -1204,6 +1210,12 @@ def Feature64BitLiterals : SubtargetFeature<"64-bit-literals",
   "Can use 64-bit literals with single DWORD instructions"
 >;
 
+def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs",
+  "Has1024AddressableVGPRs",
+  "true",
+  "Has 1024 addressable VGPRs"
+>;
+
 def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt",
   "HasWaitXcnt",
   "true",
@@ -1721,6 +1733,7 @@ def FeatureISAVersion9_0_9 : FeatureSet<
 def FeatureISAVersion9_0_A : FeatureSet<
   !listconcat(FeatureISAVersion9_0_MI_Common.Features,
     [FeatureGFX90AInsts,
+     FeatureRequiresAlignedVGPRs,
      FeatureFmacF64Inst,
      FeatureDPALU_DPP,
      FeaturePackedFP32Ops,
@@ -1743,6 +1756,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
   [FeatureGFX9,
    FeatureGFX90AInsts,
    FeatureGFX940Insts,
+   FeatureRequiresAlignedVGPRs,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
    FeatureDLInsts,
@@ -1894,6 +1908,7 @@ def FeatureISAVersion10_3_Generic: FeatureSet<
 
 def FeatureISAVersion11_Common : FeatureSet<
   [FeatureGFX11,
+   FeatureBackOffBarrier,
    FeatureLDSBankCount32,
    FeatureDLInsts,
    FeatureDot5Insts,
@@ -1977,6 +1992,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
+   FeatureBackOffBarrier,
    FeatureAddressableLocalMemorySize65536,
    FeatureLDSBankCount32,
    FeatureDLInsts,
@@ -2019,9 +2035,10 @@ def FeatureISAVersion12 : FeatureSet<
 def FeatureISAVersion12_50 : FeatureSet<
   [FeatureGFX12,
    FeatureGFX1250Insts,
-   FeatureCUStores,
+   FeatureRequiresAlignedVGPRs,
    FeatureAddressableLocalMemorySize327680,
    FeatureCuMode,
+   Feature1024AddressableVGPRs,
    Feature64BitLiterals,
    FeatureLDSBankCount32,
    FeatureDLInsts,
@@ -2830,6 +2847,9 @@ def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">,
 def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">,
   AssemblerPredicate<(all_of Feature64BitLiterals)>;
 
+def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">,
+  AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>;
+
 def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">,
   AssemblerPredicate<(all_of FeatureWaitXcnt)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 36c0d1cbcea2..29f8f9bc8b54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
   MCContext &Ctx = MF.getContext();
   uint16_t KernelCodeProperties = 0;
   const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
     KernelCodeProperties |=
@@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
     KernelCodeProperties |=
         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
   }
-  if (ST.isWave32()) {
+  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
     KernelCodeProperties |=
         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
   }
-  if (isGFX1250(ST) && ST.hasCUStores()) {
-    KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
-  }
 
   // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
   // un-evaluatable at this point so it cannot be conditionally checked here.
@@ -638,7 +634,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
   (void)PGRM_Rsrc3;
   (void)EvaluatableRsrc3;
   assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
-         STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+         STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
          static_cast<uint64_t>(PGRM_Rsrc3) == 0);
   KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
 
@@ -845,7 +841,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
     [[maybe_unused]] int64_t PGMRSrc3;
     assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
-           STM.hasGFX90AInsts() ||
+           STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
            (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
             static_cast<uint64_t>(PGMRSrc3) == 0));
     if (STM.hasGFX90AInsts()) {
@@ -1143,9 +1139,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
     return SubGPR;
   };
-
-  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
-                                        IsaInfo::getSGPREncodingGranule(&STM));
+  // GFX10+ will always allocate 128 SGPRs and this field must be 0
+  if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
+    ProgInfo.SGPRBlocks = CreateExpr(0ul);
+  } else {
+    ProgInfo.SGPRBlocks = GetNumGPRBlocks(
+        ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(&STM));
+  }
   ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                         IsaInfo::getVGPREncodingGranule(&STM));
 
@@ -1440,9 +1440,10 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
       MD->setComputeRegisters(".dynamic_vgpr_en", true);
   }
 
-  MD->setHwStage(CC, ".lds_size",
-                 (unsigned)(CurrentProgramInfo.LdsSize *
-                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
+  MD->updateHwStageMaximum(
+      CC, ".lds_size",
+      (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
+                 sizeof(uint32_t)));
 }
 
 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 59cc1df292f4..f646457f9d76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1296,74 +1296,6 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
-/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
-/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
-/// Both attributes start with narrow ranges that expand during iteration.
-/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
-/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
-/// with intermediate values during the attributor run. We defer the
-/// finalization of waves-per-eu until after the flat-workgroup-size is
-/// finalized.
-/// TODO: Remove this and move similar logic back into the attributor run once
-/// we have a better representation for waves-per-eu.
-static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
-  bool Changed = false;
-
-  LLVMContext &Ctx = M.getContext();
-
-  for (Function &F : M) {
-    if (F.isDeclaration())
-      continue;
-
-    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
-    std::optional<std::pair<unsigned, std::optional<unsigned>>>
-        FlatWgrpSizeAttr =
-            AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
-
-    unsigned MinWavesPerEU = ST.getMinWavesPerEU();
-    unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
-
-    unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
-    unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
-    if (FlatWgrpSizeAttr.has_value()) {
-      MinFlatWgrpSize = FlatWgrpSizeAttr->first;
-      MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
-    }
-
-    // Start with the "best" range.
-    unsigned Min = MinWavesPerEU;
-    unsigned Max = MinWavesPerEU;
-
-    // Compute the range from flat workgroup size. `getWavesPerEU` will also
-    // account for the 'amdgpu-waves-er-eu' attribute.
-    auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
-        ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});
-
-    // For the lower bound, we have to "tighten" it.
-    Min = std::max(Min, MinFromFlatWgrpSize);
-    // For the upper bound, we have to "extend" it.
-    Max = std::max(Max, MaxFromFlatWgrpSize);
-
-    // Clamp the range to the max range.
-    Min = std::max(Min, MinWavesPerEU);
-    Max = std::min(Max, MaxWavesPerEU);
-
-    // Update the attribute if it is not the max.
-    if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
-      SmallString<10> Buffer;
-      raw_svector_ostream OS(Buffer);
-      OS << Min << ',' << Max;
-      Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
-      Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
-      F.addFnAttr(NewAttr);
-      Changed |= OldAttr == NewAttr;
-    }
-  }
-
-  return Changed;
-}
-
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1438,11 +1370,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     }
   }
 
-  bool Changed = A.run() == ChangeStatus::CHANGED;
-
-  Changed |= updateWavesPerEU(M, TM);
-
-  return Changed;
+  return A.run() == ChangeStatus::CHANGED;
 }
 } // namespace
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index d1a5b4e85da4..21255f691e4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1004,8 +1004,14 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
     return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
   }
 
-  return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
-                                         AMDGPU::SI_TCRETURN;
+  if (CallerF.getFunction().getCallingConv() ==
+      CallingConv::AMDGPU_Gfx_WholeWave)
+    return AMDGPU::SI_TCRETURN_GFX_WholeWave;
+
+  if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave)
+    return AMDGPU::SI_TCRETURN_GFX;
+
+  return AMDGPU::SI_TCRETURN;
 }
 
 // Add operands to call instruction to track the callee.
@@ -1284,6 +1290,13 @@ bool AMDGPUCallLowering::lowerTailCall(
   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true,
                                ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall);
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+
+  if (FuncInfo->isWholeWaveFunction())
+    addOriginalExecToReturn(MF, MIB);
+
+  // Keep track of the index of the next operand to be added to the call
+  unsigned CalleeIdx = MIB->getNumOperands();
+
   if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
     return false;
 
@@ -1401,7 +1414,7 @@ bool AMDGPUCallLowering::lowerTailCall(
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
   if (!IsSibCall) {
-    MIB->getOperand(1).setImm(FPDiff);
+    MIB->getOperand(CalleeIdx + 1).setImm(FPDiff);
     CallSeqStart.addImm(NumBytes).addImm(0);
     // End the call sequence *before* emitting the call. Normally, we would
     // tidy the frame up after the call. However, here, we've laid out the
@@ -1413,16 +1426,24 @@ bool AMDGPUCallLowering::lowerTailCall(
   // Now we can add the actual call instruction to the correct basic block.
   MIRBuilder.insertInstr(MIB);
 
+  // If this is a whole wave tail call, we need to constrain the register for
+  // the original EXEC.
+  if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
+    MIB->getOperand(0).setReg(
+        constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
+                                 *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+  }
+
   // If Callee is a reg, since it is used by a target specific
   // instruction, it must have a register class matching the
   // constraint of that instruction.
 
   // FIXME: We should define regbankselectable call instructions to handle
   // divergent call targets.
-  if (MIB->getOperand(0).isReg()) {
-    MIB->getOperand(0).setReg(
-        constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
-                                 *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+  if (MIB->getOperand(CalleeIdx).isReg()) {
+    MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
+        MIB->getOperand(CalleeIdx), CalleeIdx));
   }
 
   MF.getFrameInfo().setHasTailCall();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143dd308..0c112d1787c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -128,12 +128,18 @@ def gi_global_saddr :
 def gi_global_saddr_cpol :
     GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
     GIComplexPatternEquiv<GlobalSAddrCPol>;
+def gi_global_saddr_cpol_m0 :
+    GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
+    GIComplexPatternEquiv<GlobalSAddrCPolM0>;
 def gi_global_saddr_glc :
     GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
     GIComplexPatternEquiv<GlobalSAddrGLC>;
 def gi_global_saddr_no_ioffset :
     GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
     GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
+def gi_global_saddr_no_ioffset_m0 :
+    GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffsetM0">,
+    GIComplexPatternEquiv<GlobalSAddrNoIOffsetM0>;
 
 def gi_mubuf_scratch_offset :
     GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b80e43b27129..3785d0f7f268 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
+                                                 SDValue &SAddr,
+                                                 SDValue &VOffset,
+                                                 SDValue &Offset,
+                                                 SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
+    return false;
+
+  // We are assuming CPol is second from last operand of the intrinsic.
+  auto PassedCPol =
+      N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
+  CPol = CurDAG->getTargetConstant(
+      (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
                                               SDValue &SAddr, SDValue &VOffset,
                                               SDValue &Offset,
@@ -2120,6 +2137,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
+                                                      SDValue &SAddr,
+                                                      SDValue &VOffset,
+                                                      SDValue &CPol) const {
+  bool ScaleOffset;
+  SDValue DummyOffset;
+  if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+                         false))
+    return false;
+
+  // We are assuming CPol is second from last operand of the intrinsic.
+  auto PassedCPol =
+      N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
+  CPol = CurDAG->getTargetConstant(
+      (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+  return true;
+}
+
 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
   if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 16388e750026..4fa0d3f72e1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -171,11 +171,16 @@ private:
   bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
                              SDValue &VOffset, SDValue &Offset,
                              SDValue &CPol) const;
+  bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
+                               SDValue &VOffset, SDValue &Offset,
+                               SDValue &CPol) const;
   bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
                             SDValue &VOffset, SDValue &Offset,
                             SDValue &CPol) const;
   bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
                                   SDValue &VOffset, SDValue &CPol) const;
+  bool SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr, SDValue &SAddr,
+                                    SDValue &VOffset, SDValue &CPol) const;
   bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                           SDValue &Offset) const;
   bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c048371b11d7..5c9b616e9bc2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -367,6 +367,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 
+  setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
+  setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
+  setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
+
+  setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
+  setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
+  setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
+
+  setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
+  setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
+  setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
+
   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
@@ -411,7 +423,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                      Expand);
 
-  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
+  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
 
   if (Subtarget->has16BitInsts()) {
     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
@@ -1427,8 +1439,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
-  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
-  case ISD::FREM: return LowerFREM(Op, DAG);
+  case ISD::SDIVREM:
+    return LowerSDIVREM(Op, DAG);
   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);
@@ -2423,21 +2435,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   return DAG.getMergeValues(Res, DL);
 }
 
-// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
-SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  EVT VT = Op.getValueType();
-  auto Flags = Op->getFlags();
-  SDValue X = Op.getOperand(0);
-  SDValue Y = Op.getOperand(1);
-
-  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
-  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
-  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
-  // TODO: For f32 use FMAD instead if !hasFastFMA32?
-  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
-}
-
 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -2650,10 +2647,7 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
 
 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                            SDNodeFlags Flags) {
-  if (Flags.hasApproximateFuncs())
-    return true;
-  auto &Options = DAG.getTarget().Options;
-  return Options.ApproxFuncFPMath;
+  return Flags.hasApproximateFuncs();
 }
 
 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
@@ -2775,8 +2769,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
   assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
 
   const auto &Options = getTargetMachine().Options;
-  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
-      Options.ApproxFuncFPMath) {
+  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
 
     if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
       // Log and multiply in f32 is good enough for f16.
@@ -5674,6 +5667,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CALL)
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TC_RETURN_GFX)
+  NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
   NODE_NAME_CASE(TC_RETURN_CHAIN)
   NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
   NODE_NAME_CASE(TRAP)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 78394ac9cd2d..bdaf48652d10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -418,6 +418,7 @@ enum NodeType : unsigned {
   CALL,
   TC_RETURN,
   TC_RETURN_GFX,
+  TC_RETURN_GFX_WholeWave,
   TC_RETURN_CHAIN,
   TC_RETURN_CHAIN_DVGPR,
   TRAP,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e305f08925cc..b8fa6f3fc686 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,10 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 
+def AMDGPUtc_return_gfx_ww: SDNode<"AMDGPUISD::TC_RETURN_GFX_WholeWave", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
   SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d..12915c734442 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
   return selectImpl(MI, *CoverageInfo);
 }
 
-bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
-  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
-    if (WGSize <= STI.getWavefrontSize()) {
-      // If the workgroup fits in a wave, remove s_barrier_signal and lower
-      // s_barrier/s_barrier_wait to wave_barrier.
-      if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
-          IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
-        MachineBasicBlock *MBB = MI.getParent();
-        const DebugLoc &DL = MI.getDebugLoc();
-        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
-      }
-      MI.eraseFromParent();
-      return true;
-    }
-  }
-
-  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-    MachineBasicBlock *MBB = MI.getParent();
-    const DebugLoc &DL = MI.getDebugLoc();
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  return selectImpl(MI, *CoverageInfo);
-}
-
 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                          bool &IsTexFail) {
   if (TexFailCtrl)
@@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, false);
   case Intrinsic::amdgcn_init_whole_wave:
     return selectInitWholeWave(I);
-  case Intrinsic::amdgcn_s_barrier:
-  case Intrinsic::amdgcn_s_barrier_signal:
-  case Intrinsic::amdgcn_s_barrier_wait:
-    return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
   case Intrinsic::amdgcn_struct_buffer_load_lds:
@@ -5746,6 +5709,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
 }
 
 InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
+  const MachineInstr &I = *Root.getParent();
+
+  // We are assuming CPol is second from last operand of the intrinsic.
+  auto PassedCPol =
+      I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
+  return selectGlobalSAddr(Root, PassedCPol);
+}
+
+InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
   return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
 }
@@ -5762,6 +5735,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
 }
 
 InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
+    MachineOperand &Root) const {
+  const MachineInstr &I = *Root.getParent();
+
+  // We are assuming CPol is second from last operand of the intrinsic.
+  auto PassedCPol =
+      I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
+  return selectGlobalSAddr(Root, PassedCPol, false);
+}
+
+InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   Register Addr = Root.getReg();
   Register PtrBase;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 092439693f39..c760fe7ef99d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -124,7 +124,6 @@ private:
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
   bool selectInitWholeWave(MachineInstr &MI) const;
-  bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
 
   bool selectImageIntrinsic(MachineInstr &MI,
@@ -257,9 +256,13 @@ private:
   InstructionSelector::ComplexRendererFns
   selectGlobalSAddrCPol(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
+  selectGlobalSAddrCPolM0(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
   selectGlobalSAddrGLC(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
   selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectGlobalSAddrNoIOffsetM0(MachineOperand &Root) const;
 
   InstructionSelector::ComplexRendererFns
   selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index efcd87e46620..bd443b5b6f1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -509,6 +509,10 @@ def atomic_load_nonext_64_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_64
   let IsAtomic = 1;
 }
 
+def atomic_load_nonext_128_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_128 node:$ptr)> {
+  let IsAtomic = 1;
+}
+
 def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> {
   let IsAtomic = 1;
 }
@@ -573,6 +577,8 @@ def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr),
                                   (atomic_store_32 node:$val, node:$ptr)>;
 def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr),
                                   (atomic_store_64 node:$val, node:$ptr)>;
+def atomic_store_128_#as : PatFrag<(ops node:$val, node:$ptr),
+                                   (atomic_store_128 node:$val, node:$ptr)>;
 } // End let IsAtomic = 1, AddressSpaces = ...
 } // End foreach as
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 600a13096f55..f18536cd4ab9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2082,13 +2082,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0)
     .lower();
 
-  // TODO: Only Try to form v2s16 with legal packed instructions.
-  getActionDefinitionsBuilder(G_FSHR)
-    .legalFor({{S32, S32}})
-    .lowerFor({{V2S16, V2S16}})
-    .clampMaxNumElementsStrict(0, S16, 2)
-    .scalarize(0)
-    .lower();
+  auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
+  FSHRActionDefs.legalFor({{S32, S32}})
+                              .clampMaxNumElementsStrict(0, S16, 2);
+  if (ST.hasVOP3PInsts())
+    FSHRActionDefs.lowerFor({{V2S16, V2S16}});
+  FSHRActionDefs.scalarize(0).lower();
 
   if (ST.hasVOP3PInsts()) {
     getActionDefinitionsBuilder(G_FSHL)
@@ -3414,10 +3413,7 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
 }
 
 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
-  if (Flags & MachineInstr::FmAfn)
-    return true;
-  const auto &Options = MF.getTarget().Options;
-  return Options.ApproxFuncFPMath;
+  return Flags & MachineInstr::FmAfn;
 }
 
 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3522,8 +3518,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
   const AMDGPUTargetMachine &TM =
       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
 
-  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
-      TM.Options.ApproxFuncFPMath) {
+  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
     if (Ty == F16 && !ST.has16BitInsts()) {
       Register LogVal = MRI.createGenericVirtualRegister(F32);
       auto PromoteSrc = B.buildFPExt(F32, X);
@@ -7823,6 +7818,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
+    MI.eraseFromParent();
+    return true;
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
+    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
+    MI.eraseFromParent();
+    return true;
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 000000000000..a30d9cb0412a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,161 @@
+//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+  Module &M;
+  const AMDGPUTargetMachine &TM;
+
+  AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
+      : M(M), TM(TM) {}
+
+  bool run();
+
+private:
+  bool visitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesCFG();
+  }
+};
+
+template <class T> static void forEachCall(Function &Intrin, T Callback) {
+  for (User *U : make_early_inc_range(Intrin.users())) {
+    if (auto *CI = dyn_cast<IntrinsicInst>(U))
+      Callback(CI);
+  }
+}
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+  bool Changed = false;
+
+  for (Function &F : M) {
+    switch (F.getIntrinsicID()) {
+    default:
+      continue;
+    case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_s_barrier_signal:
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+    case Intrinsic::amdgcn_s_barrier_wait:
+      forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+// Optimize barriers and lower s_barrier to a sequence of split barrier
+// intrinsics.
+bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
+  assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
+         I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
+
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
+  bool IsSingleWaveWG = false;
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+    IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+  }
+
+  IRBuilder<> B(&I);
+
+  if (IsSingleWaveWG) {
+    // Down-grade waits, remove split signals.
+    if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+        I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
+      B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+    } else if (I.getIntrinsicID() ==
+               Intrinsic::amdgcn_s_barrier_signal_isfirst) {
+      // If we're the only wave of the workgroup, we're always first.
+      I.replaceAllUsesWith(B.getInt1(true));
+    }
+    I.eraseFromParent();
+    return true;
+  }
+
+  if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
+      ST.hasSplitBarriers()) {
+    // Lower to split barriers.
+    Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+    Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+                      {BarrierID_32});
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                      {BarrierID_16});
+    I.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
+                                                 ModuleAnalysisManager &MAM) {
+  AMDGPULowerIntrinsicsImpl Impl(M, TM);
+  if (!Impl.run())
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+  AMDGPULowerIntrinsicsImpl Impl(M, TM);
+  return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                    false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+  return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
new file mode 100644
index 000000000000..1e6589eb42c1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -0,0 +1,373 @@
+//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lower VGPRs above first 256 on gfx1250.
+///
+/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
+/// VGPR addressing mode. The mode change is effective until the next change.
+/// This instruction provides high bits of a VGPR address for four of the
+/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
+/// instruction encoding. If bits are set they are added as MSB to the
+/// corresponding operand VGPR number.
+///
+/// There is no need to replace actual register operands because encoding of the
+/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
+/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
+/// VGPRs will survive until actual encoding and will result in the same actual
+/// bit encoding.
+///
+/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
+/// to a VGPR address of the subsequent instructions. The InstPrinter will take
+/// care of printing a low VGPR instead of a high one. In principle it should
+/// be viable to print actual high VGPR numbers, but that would disagree with
+/// the disassembler's printing and create a situation where asm text is not
+/// deterministic.
+///
+/// This pass creates a convention where non-fall through basic blocks shall
+/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
+/// An optimization here is possible but deemed not desirable because of the
+/// readability concerns.
+///
+/// Consequently the ABI is set to expect all 4 MSBs to be zero on entry.
+/// The pass must run very late in the pipeline to make sure no changes to VGPR
+/// operands will be made after it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULowerVGPREncoding.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PackedVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
+
+namespace {
+
+class AMDGPULowerVGPREncoding {
+  static constexpr unsigned OpNum = 4;
+  static constexpr unsigned BitsPerField = 2;
+  static constexpr unsigned NumFields = 4;
+  static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
+  using ModeType = PackedVector<unsigned, BitsPerField,
+                                std::bitset<BitsPerField * NumFields>>;
+
+  class ModeTy : public ModeType {
+  public:
+    // The bitset constructor sets all bits to zero.
+    ModeTy() : ModeType(0) {}
+
+    operator int64_t() const { return raw_bits().to_ulong(); }
+
+    static ModeTy fullMask() {
+      ModeTy M;
+      M.raw_bits().flip();
+      return M;
+    }
+  };
+
+public:
+  bool run(MachineFunction &MF);
+
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+
+  /// Most recent S_SET_VGPR_MSB inserted in the current block, if any.
+  MachineInstr *MostRecentModeSet;
+
+  /// Whether the current mode is known.
+  bool CurrentModeKnown;
+
+  /// Current mode bits.
+  ModeTy CurrentMode;
+
+  /// Current mask of mode bits that instructions since MostRecentModeSet care
+  /// about.
+  ModeTy CurrentMask;
+
+  /// Number of instructions in the current hard clause.
+  unsigned ClauseLen;
+
+  /// Number of instructions remaining in the current hard clause.
+  unsigned ClauseRemaining;
+
+  /// Break field of the current S_CLAUSE immediate (bits 11:8).
+  unsigned ClauseBreaks;
+
+  /// The S_CLAUSE instruction that opened the current hard clause.
+  MachineInstr *Clause;
+
+  /// Insert mode change before \p I. \returns true if mode was changed.
+  bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
+
+  /// Reset mode to default.
+  void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
+
+  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
+  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
+
+  /// Handle single \p MI. \return true if changed.
+  bool runOnMachineInstr(MachineInstr &MI);
+
+  /// Compute the mode and mode mask for a single \p MI given \p Ops operands
+  /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
+  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
+  /// is checked.
+  void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
+                   const AMDGPU::OpName Ops[OpNum],
+                   const AMDGPU::OpName *Ops2 = nullptr);
+
+  /// Check if an instruction \p I is within a clause and return a suitable
+  /// insertion point for a mode change. It may also modify the S_CLAUSE
+  /// instruction to extend it or drop the clause if it cannot be adjusted.
+  MachineInstr *handleClause(MachineInstr *I);
+};
+
+bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
+                                      MachineInstr *I) {
+  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
+
+  if (CurrentModeKnown) {
+    auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
+
+    if ((Delta & Mask.raw_bits()).none()) {
+      CurrentMask |= Mask;
+      return false;
+    }
+
+    if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
+      CurrentMode |= NewMode;
+      CurrentMask |= Mask;
+
+      MostRecentModeSet->getOperand(0).setImm(CurrentMode);
+      return true;
+    }
+  }
+
+  I = handleClause(I);
+  MostRecentModeSet =
+      BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
+          .addImm(NewMode);
+
+  CurrentMode = NewMode;
+  CurrentMask = Mask;
+  CurrentModeKnown = true;
+  return true;
+}
+
+std::optional<unsigned>
+AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return std::nullopt;
+
+  MCRegister Reg = MO.getReg();
+  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
+  if (!RC || !TRI->isVGPRClass(RC))
+    return std::nullopt;
+
+  unsigned Idx = TRI->getHWRegIndex(Reg);
+  return Idx >> 8;
+}
+
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
+                                          MachineInstr &MI,
+                                          const AMDGPU::OpName Ops[OpNum],
+                                          const AMDGPU::OpName *Ops2) {
+  NewMode = {};
+  Mask = {};
+
+  for (unsigned I = 0; I < OpNum; ++I) {
+    MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+
+    std::optional<unsigned> MSBits;
+    if (Op)
+      MSBits = getMSBs(*Op);
+
+#if !defined(NDEBUG)
+    if (MSBits.has_value() && Ops2) {
+      auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+      if (Op2) {
+        std::optional<unsigned> MSBits2;
+        MSBits2 = getMSBs(*Op2);
+        if (MSBits2.has_value() && MSBits != MSBits2)
+          llvm_unreachable("Invalid VOPD pair was created");
+      }
+    }
+#endif
+
+    if (!MSBits.has_value() && Ops2) {
+      Op = TII->getNamedOperand(MI, Ops2[I]);
+      if (Op)
+        MSBits = getMSBs(*Op);
+    }
+
+    if (!MSBits.has_value())
+      continue;
+
+    // Skip tied uses of src2 of VOP2: these are handled along with the defs,
+    // and only the vdst bit affects them. The same goes for VOP3 opcodes that
+    // have a 32-bit VALU encoding; other VOP3 tied uses are real operands.
+    if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
+        (SIInstrInfo::isVOP2(MI) ||
+         (SIInstrInfo::isVOP3(MI) &&
+          TII->hasVALU32BitEncoding(MI.getOpcode()))))
+      continue;
+
+    NewMode[I] = MSBits.value();
+    Mask[I] = FieldMask;
+  }
+}
+
+bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
+  auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
+  if (Ops.first) {
+    ModeTy NewMode, Mask;
+    computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
+    return setMode(NewMode, Mask, &MI);
+  }
+  assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
+
+  return false;
+}
+
+MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
+  if (!ClauseRemaining)
+    return I;
+
+  // A clause cannot start with a special instruction, so place the mode
+  // change right before the clause.
+  if (ClauseRemaining == ClauseLen) {
+    I = Clause->getPrevNode();
+    assert(I->isBundle());
+    return I;
+  }
+
+  // If the clause defines breaks, no group may start with a mode change, so
+  // just drop the clause.
+  if (ClauseBreaks) {
+    Clause->eraseFromBundle();
+    ClauseRemaining = 0;
+    return I;
+  }
+
+  // Otherwise adjust the number of instructions in the clause if it fits.
+  // If it does not, the clause will just become shorter. Since the length
+  // recorded in the clause is one less, increment the length after the
+  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
+  if (ClauseLen < 63)
+    Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
+
+  ++ClauseLen;
+
+  return I;
+}
+
+bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.has1024AddressableVGPRs())
+    return false;
+
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+
+  bool Changed = false;
+  ClauseLen = ClauseRemaining = 0;
+  CurrentMode.reset();
+  CurrentMask.reset();
+  CurrentModeKnown = true;
+  for (auto &MBB : MF) {
+    MostRecentModeSet = nullptr;
+
+    for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
+      if (MI.isMetaInstruction())
+        continue;
+
+      if (MI.isTerminator() || MI.isCall()) {
+        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+          CurrentMode.reset();
+          CurrentModeKnown = true;
+        } else
+          resetMode(&MI);
+        continue;
+      }
+
+      if (MI.isInlineAsm()) {
+        if (TII->hasVGPRUses(MI))
+          resetMode(&MI);
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
+        assert(!ClauseRemaining && "Nested clauses are not supported");
+        ClauseLen = MI.getOperand(0).getImm();
+        ClauseBreaks = (ClauseLen >> 8) & 15;
+        ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
+        Clause = &MI;
+        continue;
+      }
+
+      Changed |= runOnMachineInstr(MI);
+
+      if (ClauseRemaining)
+        --ClauseRemaining;
+    }
+
+    // If we're falling through to a block that has at least one other
+    // predecessor, we no longer know the mode.
+    MachineBasicBlock *Next = MBB.getNextNode();
+    if (Next && Next->pred_size() >= 2 &&
+        llvm::is_contained(Next->predecessors(), &MBB)) {
+      if (CurrentMode.raw_bits().any())
+        CurrentModeKnown = false;
+    }
+  }
+
+  return Changed;
+}
+
+class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    return AMDGPULowerVGPREncoding().run(MF);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // namespace
+
+char AMDGPULowerVGPREncodingLegacy::ID = 0;
+
+char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;
+
+INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
+                "AMDGPU Lower VGPR Encoding", false, false)
+
+PreservedAnalyses
+AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
+                                 MachineFunctionAnalysisManager &MFAM) {
+  if (!AMDGPULowerVGPREncoding().run(MF))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h
new file mode 100644
index 000000000000..c8c2051c9fdd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.h
@@ -0,0 +1,25 @@
+//===--- AMDGPULowerVGPREncoding.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPULowerVGPREncodingPass
+    : public PassInfoMixin<AMDGPULowerVGPREncodingPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPULOWERVGPRENCODING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c84a0f6e3138..6acbf52b97de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -373,6 +373,13 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
                              MF->getInfo<SIMachineFunctionInfo>(),
                              *OutStreamer);
 
+    if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
+      unsigned V = MI->getOperand(0).getImm();
+      OutStreamer->AddComment(
+          " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) +
+          " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3));
+    }
+
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index eda479064d7b..d09b7cffe9f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
   AgentSSID = CTX.getOrInsertSyncScopeID("agent");
   WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
   WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+  ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
   SystemOneAddressSpaceSSID =
       CTX.getOrInsertSyncScopeID("one-as");
   AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
       CTX.getOrInsertSyncScopeID("wavefront-one-as");
   SingleThreadOneAddressSpaceSSID =
       CTX.getOrInsertSyncScopeID("singlethread-one-as");
+  ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 5c2ecaa65714..bf852bb38376 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -32,6 +32,8 @@ private:
   SyncScope::ID WorkgroupSSID;
   /// Wavefront synchronization scope ID (cross address space).
   SyncScope::ID WavefrontSSID;
+  /// Cluster synchronization scope ID (cross address space).
+  SyncScope::ID ClusterSSID;
   /// System synchronization scope ID (single address space).
   SyncScope::ID SystemOneAddressSpaceSSID;
   /// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ private:
   SyncScope::ID WavefrontOneAddressSpaceSSID;
   /// Single thread synchronization scope ID (single address space).
   SyncScope::ID SingleThreadOneAddressSpaceSSID;
+  /// Cluster synchronization scope ID (single address space).
+  SyncScope::ID ClusterOneAddressSpaceSSID;
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,12 +64,15 @@ private:
     else if (SSID == getWorkgroupSSID() ||
              SSID == getWorkgroupOneAddressSpaceSSID())
       return 2;
+    else if (SSID == getClusterSSID() ||
+             SSID == getClusterOneAddressSpaceSSID())
+      return 3;
     else if (SSID == getAgentSSID() ||
              SSID == getAgentOneAddressSpaceSSID())
-      return 3;
+      return 4;
     else if (SSID == SyncScope::System ||
              SSID == getSystemOneAddressSpaceSSID())
-      return 4;
+      return 5;
 
     return std::nullopt;
   }
@@ -73,11 +80,12 @@ private:
   /// \returns True if \p SSID is restricted to single address space, false
   /// otherwise
   bool isOneAddressSpace(SyncScope::ID SSID) const {
-    return SSID == getSingleThreadOneAddressSpaceSSID() ||
-        SSID == getWavefrontOneAddressSpaceSSID() ||
-        SSID == getWorkgroupOneAddressSpaceSSID() ||
-        SSID == getAgentOneAddressSpaceSSID() ||
-        SSID == getSystemOneAddressSpaceSSID();
+    return SSID == getClusterOneAddressSpaceSSID() ||
+           SSID == getSingleThreadOneAddressSpaceSSID() ||
+           SSID == getWavefrontOneAddressSpaceSSID() ||
+           SSID == getWorkgroupOneAddressSpaceSSID() ||
+           SSID == getAgentOneAddressSpaceSSID() ||
+           SSID == getSystemOneAddressSpaceSSID();
   }
 
 public:
@@ -95,6 +103,8 @@ public:
   SyncScope::ID getWavefrontSSID() const {
     return WavefrontSSID;
   }
+  /// \returns Cluster synchronization scope ID (cross address space).
+  SyncScope::ID getClusterSSID() const { return ClusterSSID; }
   /// \returns System synchronization scope ID (single address space).
   SyncScope::ID getSystemOneAddressSpaceSSID() const {
     return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ public:
   SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
     return SingleThreadOneAddressSpaceSSID;
   }
+  /// \returns Cluster synchronization scope ID (single address space).
+  SyncScope::ID getClusterOneAddressSpaceSSID() const {
+    return ClusterOneAddressSpaceSSID;
+  }
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6ddfa386e8ac..9449e7093091 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
 MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
             AMDGPULowerBufferFatPointersPass(*this))
+MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-perf-hint",
@@ -105,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
 #endif
 MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-lower-vgpr-encoding", AMDGPULowerVGPREncodingPass())
 MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
 MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
 MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f226c7f381aa..7dbe1235a98b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,7 +70,7 @@ static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
     "amdgpu-promote-alloca-to-vector-max-regs",
     cl::desc(
         "Maximum vector size (in 32b registers) to use when promoting alloca"),
-    cl::init(16));
+    cl::init(32));
 
 // Use up to 1/4 of available register budget for vectorization.
 // FIXME: Increase the limit for whole function budgets? Perhaps x2?
@@ -287,8 +287,12 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
 
 void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
   // Load per function limits, overriding with global options where appropriate.
+  // R600 register tuples/aliasing are fragile with large vector promotions so
+  // apply architecture specific limit here.
+  const int R600MaxVectorRegs = 16;
   MaxVectorRegs = F.getFnAttributeAsParsedInteger(
-      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+      "amdgpu-promote-alloca-to-vector-max-regs",
+      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
   if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
     MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
   VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
@@ -439,9 +443,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   APInt IndexQuot;
-  uint64_t Rem;
-  APInt::udivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
-  if (Rem != 0)
+  APInt Rem;
+  APInt::sdivrem(ConstOffset, APInt(ConstOffset.getBitWidth(), VecElemSize),
+                 IndexQuot, Rem);
+  if (!Rem.isZero())
     return nullptr;
   if (VarOffsets.size() == 0)
     return ConstantInt::get(GEP->getContext(), IndexQuot);
@@ -450,8 +455,10 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   const auto &VarOffset = VarOffsets.front();
   APInt OffsetQuot;
-  APInt::udivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
-  if (Rem != 0 || OffsetQuot.isZero())
+  APInt::sdivrem(VarOffset.second,
+                 APInt(VarOffset.second.getBitWidth(), VecElemSize), OffsetQuot,
+                 Rem);
+  if (!Rem.isZero() || OffsetQuot.isZero())
     return nullptr;
 
   Value *Offset = VarOffset.first;
@@ -461,7 +468,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   if (!OffsetQuot.isOne()) {
     ConstantInt *ConstMul =
-        ConstantInt::get(OffsetType, OffsetQuot.getZExtValue());
+        ConstantInt::get(OffsetType, OffsetQuot.getSExtValue());
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -470,8 +477,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return Offset;
 
   ConstantInt *ConstIndex =
-      ConstantInt::get(OffsetType, IndexQuot.getZExtValue());
-  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+      ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
+  Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
   return IndexAdd;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 237929699dd9..36b27bef350e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3322,6 +3322,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       constrainOpWithReadfirstlane(B, MI, 6); // soffset
       return;
     }
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+      applyDefaultMapping(OpdMapper);
+      constrainOpWithReadfirstlane(B, MI, 5);
+      return;
+    }
     case Intrinsic::amdgcn_load_to_lds:
     case Intrinsic::amdgcn_global_load_lds: {
       applyDefaultMapping(OpdMapper);
@@ -3338,6 +3346,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       applyDefaultMapping(OpdMapper);
       constrainOpWithReadfirstlane(B, MI, 8); // M0
       return;
+    case Intrinsic::amdgcn_cluster_load_b32:
+    case Intrinsic::amdgcn_cluster_load_b64:
+    case Intrinsic::amdgcn_cluster_load_b128: {
+      applyDefaultMapping(OpdMapper);
+      constrainOpWithReadfirstlane(B, MI, 4); // M0
+      return;
+    }
     case Intrinsic::amdgcn_s_sleep_var:
       assert(OpdMapper.getVRegs(1).empty());
       constrainOpWithReadfirstlane(B, MI, 1);
@@ -5466,6 +5481,27 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_cluster_load_b32:
+    case Intrinsic::amdgcn_cluster_load_b64:
+    case Intrinsic::amdgcn_cluster_load_b128: {
+      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      unsigned M0Bank =
+          getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
+      break;
+    }
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+    case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
+      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      unsigned M0Bank =
+          getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32);
+      break;
+    }
     case Intrinsic::amdgcn_global_store_async_from_lds_b8:
     case Intrinsic::amdgcn_global_store_async_from_lds_b32:
     case Intrinsic::amdgcn_global_store_async_from_lds_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 8b1d4ba68a44..21cf9cc6878f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,6 +14,10 @@
 /// MFMA opcode.
 ///
 /// TODO:
+/// - Handle rewrites of phis. This must be more careful than normal about the
+///   reassignment. We do not want to introduce an AGPR-to-AGPR copy inside of a
+///   loop, so it depends on the exact assignment of the copy.
+///
 ///  - Update LiveIntervals incrementally instead of recomputing from scratch
 ///
 //===----------------------------------------------------------------------===//
@@ -22,6 +26,7 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -34,6 +39,9 @@ using namespace llvm;
 
 namespace {
 
+STATISTIC(NumMFMAsRewrittenToAGPR,
+          "Number of MFMA instructions rewritten to use AGPR form");
+
 class AMDGPURewriteAGPRCopyMFMAImpl {
   MachineFunction &MF;
   const GCNSubtarget &ST;
@@ -60,6 +68,25 @@ public:
     return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
   }
 
+  /// Find AV_* registers assigned to AGPRs (or virtual registers which were
+  /// already required to be AGPR).
+  ///
+  /// \return the assigned physical register that \p VReg is assigned to if it
+  /// is an AGPR, otherwise MCRegister().
+  MCRegister getAssignedAGPR(Register VReg) const {
+    MCRegister PhysReg = VRM.getPhys(VReg);
+    if (!PhysReg)
+      return MCRegister();
+
+    // If this is an AV register, we have to check if the actual assignment is
+    // to an AGPR
+    const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
+    return TRI.isAGPRClass(AssignedRC) ? PhysReg : MCRegister();
+  }
+
+  bool tryReassigningMFMAChain(MachineInstr &MFMA, Register MFMAHintReg,
+                               MCPhysReg PhysRegHint) const;
+
   /// Compute the register class constraints based on the uses of \p Reg,
   /// excluding MFMA uses from which can be rewritten to change the register
   /// class constraint. This should be nearly identical to
@@ -74,6 +101,8 @@ public:
       Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
       SmallSetVector<Register, 4> &RewriteRegs) const;
 
+  bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
+  bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
   bool run(MachineFunction &MF) const;
 };
 
@@ -154,6 +183,88 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
   return true;
 }
 
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryReassigningMFMAChain(
+    MachineInstr &MFMA, Register MFMAHintReg, MCPhysReg PhysRegHint) const {
+  // src2 and dst have the same physical class constraint; try to preserve
+  // the original src2 subclass if one were to exist.
+  SmallVector<MachineInstr *, 4> RewriteCandidates = {&MFMA};
+  SmallSetVector<Register, 4> RewriteRegs;
+
+  // Make sure we reassign the MFMA we found the copy from first. We want
+  // to ensure dst ends up in the physreg we were originally copying to.
+  RewriteRegs.insert(MFMAHintReg);
+
+  // We've found av = COPY (MFMA) (or MFMA (v = COPY av)) and need to verify
+  // that we can trivially rewrite src2 to use the new AGPR. If we can't
+  // trivially replace it, we're going to induce as many copies as we would have
+  // emitted in the first place, as well as need to assign another register, and
+  // need to figure out where to put them. The live range splitting is smarter
+  // than anything we're doing here, so trust it did something reasonable.
+  //
+  // Note recomputeRegClassExceptRewritable will consider the constraints of
+  // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
+  if (!recomputeRegClassExceptRewritable(MFMAHintReg, RewriteCandidates,
+                                         RewriteRegs)) {
+    LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
+                      << printReg(MFMAHintReg, &TRI) << '\n');
+    return false;
+  }
+
+  // If src2 and dst are different registers, we need to also reassign the
+  // input to an available AGPR if it is compatible with all other uses.
+  //
+  // If we can't reassign it, we'd need to introduce a different copy
+  // which is likely worse than the copy we'd be saving.
+  //
+  // It's likely that the MFMA is used in sequence with other MFMAs; if we
+  // cannot migrate the full use/def chain of MFMAs, we would need to
+  // introduce intermediate copies somewhere. So we only make the
+  // transform if all the interfering MFMAs can also be migrated. Collect
+  // the set of rewritable MFMAs and check if we can assign an AGPR at
+  // that point.
+  //
+  // If any of the MFMAs aren't reassignable, we give up and rollback to
+  // the original register assignments.
+
+  using RecoloringStack =
+      SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
+  RecoloringStack TentativeReassignments;
+
+  for (Register RewriteReg : RewriteRegs) {
+    LiveInterval &LI = LIS.getInterval(RewriteReg);
+    TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
+    LRM.unassign(LI);
+  }
+
+  if (!attemptReassignmentsToAGPR(RewriteRegs, PhysRegHint)) {
+    // Roll back the register assignments to the original state.
+    for (auto [LI, OldAssign] : TentativeReassignments) {
+      if (VRM.hasPhys(LI->reg()))
+        LRM.unassign(*LI);
+      LRM.assign(*LI, OldAssign);
+    }
+
+    return false;
+  }
+
+  // Fixup the register classes of the virtual registers now that we've
+  // committed to the reassignments.
+  for (Register InterferingReg : RewriteRegs) {
+    const TargetRegisterClass *EquivalentAGPRRegClass =
+        TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
+    MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
+  }
+
+  for (MachineInstr *RewriteCandidate : RewriteCandidates) {
+    int NewMFMAOp =
+        AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
+    RewriteCandidate->setDesc(TII.get(NewMFMAOp));
+    ++NumMFMAsRewrittenToAGPR;
+  }
+
+  return true;
+}
+
 /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
 /// preference to use \p PhysReg first. Returns false if the reassignments
 /// cannot be trivially performed.
@@ -206,140 +317,104 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR(
   return true;
 }
 
-bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
-  // This only applies on subtargets that have a configurable AGPR vs. VGPR
-  // allocation.
-  if (!ST.hasGFX90AInsts())
-    return false;
-
-  // Early exit if no AGPRs were assigned.
-  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
-    LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
-    return false;
-  }
-
+/// Identify copies that look like:
+/// %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src2:vgpr
+/// %agpr = COPY %vdst
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover the common case.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesToAGPR(
+    Register VReg, MCRegister AssignedAGPR) const {
   bool MadeChange = false;
-
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-    Register VReg = Register::index2VirtReg(I);
-    Register PhysReg = VRM.getPhys(VReg);
-    if (!PhysReg)
+  for (MachineInstr &UseMI : MRI.def_instructions(VReg)) {
+    if (!UseMI.isCopy())
       continue;
 
-    // Find AV_* registers assigned to AGPRs.
-    const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg);
-    if (!TRI.hasAGPRs(VirtRegRC))
+    Register CopySrcReg = UseMI.getOperand(1).getReg();
+    if (!CopySrcReg.isVirtual())
       continue;
 
-    const TargetRegisterClass *AssignedRC = VirtRegRC;
-    if (TRI.hasVGPRs(VirtRegRC)) {
-      // If this is an AV register, we have to check if the actual assignment is
-      // to an AGPR
-      AssignedRC = TRI.getPhysRegBaseClass(PhysReg);
-      if (!TRI.isAGPRClass(AssignedRC))
-        continue;
+    // TODO: Handle loop phis copied to AGPR. e.g.
+    //
+    // loop:
+    //   %phi:vgpr = COPY %mfma:vgpr
+    //   %mfma:vgpr = V_MFMA_xxx_vgprcd_e64 %a, %b, %phi
+    //   s_cbranch_vccnz loop
+    //
+    // endloop:
+    //   %agpr = COPY %mfma
+    //
+    // We need to be sure that %phi is assigned to the same physical register as
+    // %mfma, or else we will just be moving copies into the loop.
+
+    for (MachineInstr &CopySrcDefMI : MRI.def_instructions(CopySrcReg)) {
+      if (isRewriteCandidate(CopySrcDefMI) &&
+          tryReassigningMFMAChain(
+              CopySrcDefMI, CopySrcDefMI.getOperand(0).getReg(), AssignedAGPR))
+        MadeChange = true;
     }
+  }
 
-    LiveInterval &LI = LIS.getInterval(VReg);
-
-    for (VNInfo *VNI : LI.vnis()) {
-      if (VNI->isPHIDef() || VNI->isUnused())
-        continue;
-
-      MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
-      if (!DefMI || !DefMI->isCopy())
-        continue;
+  return MadeChange;
+}
 
-      Register MFMADstReg = DefMI->getOperand(1).getReg();
-      if (!MFMADstReg.isVirtual())
-        continue;
+/// Identify copies that look like:
+/// %src2:vgpr = COPY %agpr
+/// %vdst:vgpr = V_MFMA_... %src0:av, %src1:av, %src2:vgpr
+///
+/// Then try to replace the transitive uses of %src2 and %vdst with the AGPR
+/// versions of the MFMA. This should cover rarer cases, and will generally be
+/// redundant with tryFoldCopiesToAGPR.
+bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
+    Register VReg, MCRegister AssignedAGPR) const {
+  bool MadeChange = false;
+  for (MachineInstr &UseMI : MRI.use_instructions(VReg)) {
+    if (!UseMI.isCopy())
+      continue;
 
-      LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg);
-      LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot());
-      MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def);
-      if (!MFMA || !isRewriteCandidate(*MFMA))
+    Register CopyDstReg = UseMI.getOperand(0).getReg();
+    if (!CopyDstReg.isVirtual())
+      continue;
+    for (MachineOperand &CopyUseMO : MRI.reg_nodbg_operands(CopyDstReg)) {
+      if (!CopyUseMO.readsReg())
         continue;
 
-      // src2 and dst have the same physical class constraint; try to preserve
-      // the original src2 subclass if one were to exist.
-      SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
-      SmallSetVector<Register, 4> RewriteRegs;
-
-      // Make sure we reassign the MFMA we found the copy from first. We want
-      // to ensure dst ends up in the physreg we were originally copying to.
-      RewriteRegs.insert(MFMADstReg);
-
-      // We've found av = COPY (MFMA), and need to verify that we can trivially
-      // rewrite src2 to use the new AGPR. If we can't trivially replace it,
-      // we're going to induce as many copies as we would have emitted in the
-      // first place, as well as need to assign another register, and need to
-      // figure out where to put them. The live range splitting is smarter than
-      // anything we're doing here, so trust it did something reasonable.
-      //
-      // Note recomputeRegClassExceptRewritable will consider the constraints of
-      // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
-      if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
-                                             RewriteRegs)) {
-        LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
-                          << printReg(MFMADstReg, &TRI) << '\n');
-        continue;
+      MachineInstr &CopyUseMI = *CopyUseMO.getParent();
+      if (isRewriteCandidate(CopyUseMI)) {
+        if (tryReassigningMFMAChain(CopyUseMI, CopyDstReg,
+                                    VRM.getPhys(CopyDstReg)))
+          MadeChange = true;
       }
+    }
+  }
 
-      // If src2 and dst are different registers, we need to also reassign the
-      // input to an available AGPR if it is compatible with all other uses.
-      //
-      // If we can't reassign it, we'd need to introduce a different copy
-      // which is likely worse than the copy we'd be saving.
-      //
-      // It's likely that the MFMA is used in sequence with other MFMAs; if we
-      // cannot migrate the full use/def chain of MFMAs, we would need to
-      // introduce intermediate copies somewhere. So we only make the
-      // transform if all the interfering MFMAs can also be migrated. Collect
-      // the set of rewritable MFMAs and check if we can assign an AGPR at
-      // that point.
-      //
-      // If any of the MFMAs aren't reassignable, we give up and rollback to
-      // the original register assignments.
-
-      using RecoloringStack =
-          SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
-      RecoloringStack TentativeReassignments;
-
-      for (Register RewriteReg : RewriteRegs) {
-        LiveInterval &LI = LIS.getInterval(RewriteReg);
-        TentativeReassignments.push_back({&LI, VRM.getPhys(RewriteReg)});
-        LRM.unassign(LI);
-      }
+  return MadeChange;
+}
 
-      if (!attemptReassignmentsToAGPR(RewriteRegs, PhysReg)) {
-        // Roll back the register assignments to the original state.
-        for (auto [LI, OldAssign] : TentativeReassignments) {
-          if (VRM.hasPhys(LI->reg()))
-            LRM.unassign(*LI);
-          LRM.assign(*LI, OldAssign);
-        }
+bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
+  // This only applies on subtargets that have a configurable AGPR vs. VGPR
+  // allocation.
+  if (!ST.hasGFX90AInsts())
+    return false;
 
-        continue;
-      }
+  // Early exit if no AGPRs were assigned.
+  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) {
+    LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n");
+    return false;
+  }
 
-      // Fixup the register classes of the virtual registers now that we've
-      // committed to the reassignments.
-      for (Register InterferingReg : RewriteRegs) {
-        const TargetRegisterClass *EquivalentAGPRRegClass =
-            TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg));
-        MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass);
-      }
+  bool MadeChange = false;
 
-      for (MachineInstr *RewriteCandidate : RewriteCandidates) {
-        int NewMFMAOp =
-            AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode());
-        RewriteCandidate->setDesc(TII.get(NewMFMAOp));
-      }
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    Register VReg = Register::index2VirtReg(I);
+    MCRegister AssignedAGPR = getAssignedAGPR(VReg);
+    if (!AssignedAGPR)
+      continue;
 
-      // We likely left an identity copy behind after assignment; let
-      // VirtRegRewriter deal with it later.
+    if (tryFoldCopiesToAGPR(VReg, AssignedAGPR))
+      MadeChange = true;
+    if (tryFoldCopiesFromAGPR(VReg, AssignedAGPR))
       MadeChange = true;
-    }
   }
 
   return MadeChange;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf954..73acb1ddbd2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -210,18 +210,10 @@ AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
   // Default/requested minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
   // Minimum number of bytes allocated in the LDS.
-  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
-                                                      {0, UINT32_MAX}, true)
-                          .first;
-  return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
-}
-
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
-    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
-  // Minimum number of bytes allocated in the LDS.
-  unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
-                                                      {0, UINT32_MAX}, true)
-                          .first;
+  unsigned LDSBytes =
+      AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", {0, UINT32_MAX},
+                                      /*OnlyFirstRequired=*/true)
+          .first;
   return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
 }
 
@@ -237,11 +229,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
   return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
 }
 
-static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+std::optional<unsigned>
+AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
+                                      unsigned Dim) const {
   auto *Node = Kernel.getMetadata("reqd_work_group_size");
   if (Node && Node->getNumOperands() == 3)
     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
-  return std::numeric_limits<unsigned>::max();
+  return std::nullopt;
+}
+
+bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
+    const Function &F, bool RequiresUniformYZ) const {
+  auto *Node = F.getMetadata("reqd_work_group_size");
+  if (!Node || Node->getNumOperands() != 3)
+    return false;
+  unsigned XLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
+  unsigned YLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
+  unsigned ZLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
+
+  bool Is1D = YLen <= 1 && ZLen <= 1;
+  bool IsXLargeEnough =
+      isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
+  return Is1D || IsXLargeEnough;
 }
 
 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -250,9 +262,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
 
 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                            unsigned Dimension) const {
-  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
-  if (ReqdSize != std::numeric_limits<unsigned>::max())
-    return ReqdSize - 1;
+  std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+  if (ReqdSize)
+    return *ReqdSize - 1;
   return getFlatWorkGroupSizes(Kernel).second - 1;
 }
 
@@ -303,9 +315,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
       }
 
       if (Dim <= 3) {
-        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
-        if (ReqdSize != std::numeric_limits<unsigned>::max())
-          MinSize = MaxSize = ReqdSize;
+        std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+        if (ReqdSize)
+          MinSize = MaxSize = *ReqdSize;
       }
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6878744496cf..57b757c990e1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -100,6 +100,26 @@ public:
   /// be converted to integer, or violate subtarget's specifications.
   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 
+  /// \returns The required size of workgroups that will be used to execute \p F
+  /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
+  /// metadata). Otherwise, returns std::nullopt.
+  std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
+                                               unsigned Dim) const;
+
+  /// \returns true if \p F will execute in a manner that leaves the X
+  /// dimension of the workitem ID evenly tiling wavefronts - that is, if X /
+  /// wavefrontsize is uniform. This is true if either the Y and Z block
+  /// dimensions are known to always be 1 or if the X dimension will always be a
+  /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y
+  /// and Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
+  /// wavesize64 would ordinarily pass this test, it won't with
+  /// \p RequiresUniformYZ).
+  ///
+  /// This information is currently only gathered from the !reqd_work_group_size
+  /// metadata on \p F, but this may be improved in the future.
+  bool hasWavefrontsEvenlySplittingXDim(const Function &F,
+                                        bool RequiresUniformYZ = false) const;
+
   /// \returns Subtarget's default pair of minimum/maximum number of waves per
   /// execution unit for function \p F, or minimum/maximum number of waves per
   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e969f9ec8889..9afe7590fe4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "AMDGPUExportKernelRuntimeHandles.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
+#include "AMDGPULowerVGPREncoding.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUPreloadKernArgProlog.h"
@@ -577,12 +578,14 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
   initializeAMDGPULowerBufferFatPointersPass(*PR);
+  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
   initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
   initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
+  initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
   initializeSIInsertHardClausesLegacyPass(*PR);
   initializeSIInsertWaitcntsLegacyPass(*PR);
   initializeSIModeRegisterLegacyPass(*PR);
@@ -1418,6 +1421,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
     // nodes out of the graph, which leads to function-level passes not
     // being run on them, which causes crashes in the resource usage analysis).
     addPass(createAMDGPULowerBufferFatPointersPass());
+    addPass(createAMDGPULowerIntrinsicsLegacyPass());
     // In accordance with the above FIXME, manually force all the
     // function-level passes into a CGSCCPassManager.
     addPass(new DummyCGSCCPass());
@@ -1797,6 +1801,8 @@ void GCNPassConfig::addPreEmitPass() {
 
   addPass(&AMDGPUWaitSGPRHazardsLegacyID);
 
+  addPass(&AMDGPULowerVGPREncodingLegacyID);
+
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
     addPass(&AMDGPUInsertDelayAluID);
 
@@ -2155,9 +2161,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
   // nodes out of the graph, which leads to function-level passes not
   // being run on them, which causes crashes in the resource usage analysis).
   addPass(AMDGPULowerBufferFatPointersPass(TM));
-
   addPass.requireCGSCCOrder();
 
+  addPass(AMDGPULowerIntrinsicsPass(TM));
+
   Base::addCodeGenPrepare(addPass);
 
   if (isPassEnabled(EnableLoadStoreVectorizer))
@@ -2383,6 +2390,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
   // cases.
   addPass(PostRAHazardRecognizerPass());
   addPass(AMDGPUWaitSGPRHazardsPass());
+  addPass(AMDGPULowerVGPREncodingPass());
 
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
     addPass(AMDGPUInsertDelayAluPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f1..3e2b2c351056 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
              DstAS == AMDGPUAS::FLAT_ADDRESS &&
              ST->hasGloballyAddressableScratch();
     }
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::amdgcn_workitem_id_z: {
+      const Function *F = Intrinsic->getFunction();
+      bool HasUniformYZ =
+          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
+      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+          *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+      return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
+    }
     default:
       return AMDGPU::isIntrinsicSourceOfDivergence(IID);
     }
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   // packed into a same wave which gives 1 and 0 after the division by 64
   // respectively.
   //
-  // FIXME: limit it to 1D kernels only, although that shall be possible
-  // to perform this optimization is the size of the X dimension is a power
-  // of 2, we just do not currently have infrastructure to query it.
+  // The X dimension doesn't reset within a wave if either both the Y
+  // and Z dimensions are of length 1, or if the X dimension's required
+  // size is a power of 2. Note, however, if the X dimension's maximum
+  // size is a power of 2 < the wavefront size, division by the wavefront
+  // size is guaranteed to yield 0, so this is also a no-reset case.
+  bool XDimDoesntResetWithinWaves = false;
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    const Function *F = I->getFunction();
+    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
+  }
   using namespace llvm::PatternMatch;
   uint64_t C;
   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C))) ||
       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C)))) {
-    const Function *F = cast<Instruction>(V)->getFunction();
-    return C >= ST->getWavefrontSizeLog2() &&
-           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
   }
 
   Value *Mask;
   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        m_Value(Mask)))) {
-    const Function *F = cast<Instruction>(V)->getFunction();
-    const DataLayout &DL = F->getDataLayout();
     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
                ST->getWavefrontSizeLog2() &&
-           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+           XDimDoesntResetWithinWaves;
   }
 
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2e21ba4c30b5..e420f2ad676f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -564,6 +564,14 @@ public:
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
   }
 
+  bool isVCSrc_b32_Lo256() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_32_Lo256RegClassID, MVT::i32);
+  }
+
+  bool isVCSrc_b64_Lo256() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_64_Lo256RegClassID, MVT::i64);
+  }
+
   bool isVCSrc_b64() const {
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
   }
@@ -1007,7 +1015,7 @@ public:
   bool isEndpgm() const;
 
   auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {
-    return [=](){ return P(*this); };
+    return [this, P]() { return P(*this); };
   }
 
   StringRef getToken() const {
@@ -1886,6 +1894,7 @@ private:
   bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
                               const unsigned CPol);
   bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
+  bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);
   std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
   bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);
   unsigned getConstantBusLimit(unsigned Opcode) const;
@@ -2985,7 +2994,12 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
 
   const MCRegisterInfo *TRI = getContext().getRegisterInfo();
   const MCRegisterClass RC = TRI->getRegClass(RCID);
-  if (RegIdx >= RC.getNumRegs()) {
+  if (RegIdx >= RC.getNumRegs() || (RegKind == IS_VGPR && RegIdx > 255)) {
+    Error(Loc, "register index is out of range");
+    return AMDGPU::NoRegister;
+  }
+
+  if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) {
     Error(Loc, "register index is out of range");
     return MCRegister();
   }
@@ -4768,12 +4782,14 @@ bool AMDGPUAsmParser::validateOffset(const MCInst &Inst,
     return validateSMEMOffset(Inst, Operands);
 
   const auto &Op = Inst.getOperand(OpNum);
+  // GFX12+ buffer ops: InstOffset is signed 24-bit, but must not be negative.
   if (isGFX12Plus() &&
       (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) {
     const unsigned OffsetSize = 24;
-    if (!isIntN(OffsetSize, Op.getImm())) {
+    if (!isUIntN(OffsetSize - 1, Op.getImm())) {
       Error(getFlatOffsetLoc(Operands),
-            Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
+            Twine("expected a ") + Twine(OffsetSize - 1) +
+                "-bit unsigned offset for buffer ops");
       return false;
     }
   } else {
@@ -4856,7 +4872,9 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst,
     return true;
 
   Error(getSMEMOffsetLoc(Operands),
-        isGFX12Plus()          ? "expected a 24-bit signed offset"
+        isGFX12Plus() && IsBuffer
+            ? "expected a 23-bit unsigned offset for buffer ops"
+        : isGFX12Plus()        ? "expected a 24-bit signed offset"
         : (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset"
                                : "expected a 21-bit signed offset");
 
@@ -5216,7 +5234,7 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
 
 bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
   auto FB = getFeatureBits();
-  if (!FB[AMDGPU::FeatureGFX90AInsts] && !FB[AMDGPU::FeatureGFX1250Insts])
+  if (!FB[AMDGPU::FeatureRequiresAlignedVGPRs])
     return true;
 
   unsigned Opc = Inst.getOpcode();
@@ -5542,6 +5560,22 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,
   return true;
 }
 
+bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst,
+                                         const OperandVector &Operands) {
+  if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12)
+    return true;
+
+  int Simm16Pos =
+      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16);
+  if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) {
+    SMLoc Loc = Operands[1]->getStartLoc();
+    Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]");
+    return false;
+  }
+
+  return true;
+}
+
 bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,
                                    const OperandVector &Operands) {
   unsigned Opc = Inst.getOpcode();
@@ -5706,6 +5740,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
   if (!validateTFE(Inst, Operands)) {
     return false;
   }
+  if (!validateSetVgprMSB(Inst, Operands)) {
+    return false;
+  }
   if (!validateWMMA(Inst, Operands)) {
     return false;
   }
@@ -5799,6 +5836,7 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                               uint64_t &ErrorInfo,
                                               bool MatchingInlineAsm) {
   MCInst Inst;
+  Inst.setLoc(IDLoc);
   unsigned Result = Match_Success;
   for (auto Variant : getMatchedVariants()) {
     uint64_t EI;
@@ -5822,7 +5860,6 @@ bool AMDGPUAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     if (!validateInstruction(Inst, IDLoc, Operands)) {
       return true;
     }
-    Inst.setLoc(IDLoc);
     Out.emitInstruction(Inst, getSTI());
     return false;
   }
@@ -6144,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
                        ExprVal, ValRange);
       if (Val)
         ImpliedUserSGPRCount += 1;
-    } else if (ID == ".amdhsa_uses_cu_stores") {
-      if (!isGFX1250())
-        return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
-
-      PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
     } else if (ID == ".amdhsa_wavefront_size32") {
       EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 1956a15c57d6..f229298ba516 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -132,7 +132,6 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let OtherPredicates    = ps.OtherPredicates;
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let SchedRW            = ps.SchedRW;
   let mayLoad            = ps.mayLoad;
@@ -159,11 +158,10 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
 }
 
-class getMTBUFInsDA<list<RegisterClass> vdataList,
+class getMTBUFInsDA<list<RegisterOperand> vdataList,
                     list<RegisterClass> vaddrList=[], bit hasRestrictedSOffset> {
-  RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+  RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList));
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
-  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
 
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset),
                                  (ins SCSrc_b32:$soffset));
@@ -179,7 +177,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
                 !con((ins vdata_op:$vdata), Inputs));
 }
 
-class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit hasRestrictedSOffset> {
+class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> {
   dag ret =
     !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret,
     !if(!eq(addrKind, BUFAddrKind.OffEn),  getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret,
@@ -218,25 +216,23 @@ class MTBUF_SetupAddr<int addrKind> {
 
 class MTBUF_Load_Pseudo <string opName,
                          int addrKind,
-                         RegisterClass vdataClass,
+                         RegisterOperand vdataClass,
                          int elems,
                          bit hasRestrictedSOffset = 0,
-                         list<dag> pattern=[],
-                         // Workaround bug bz30254
-                         int addrKindCopy = addrKind>
+                         list<dag> pattern=[]>
   : MTBUF_Pseudo<opName,
-                 (outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
-                 getMTBUFIns<addrKindCopy, [], hasRestrictedSOffset>.ret,
-                 getMTBUFAsmOps<addrKindCopy>.ret,
+                 (outs vdataClass:$vdata),
+                 getMTBUFIns<addrKind, [], hasRestrictedSOffset>.ret,
+                 getMTBUFAsmOps<addrKind>.ret,
                  pattern>,
-    MTBUF_SetupAddr<addrKindCopy> {
-  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+    MTBUF_SetupAddr<addrKind> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
   let mayLoad = 1;
   let mayStore = 0;
   let elements = elems;
 }
 
-multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterOperand vdataClass,
                               int elems, bit hasRestrictedSOffset> {
 
   def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>,
@@ -257,7 +253,7 @@ multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass,
   }
 }
 
-multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterOperand vdataClass,
                               int elems> {
   defm NAME : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 0>;
   defm _VBUFFER : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 1>;
@@ -265,26 +261,23 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
 
 class MTBUF_Store_Pseudo <string opName,
                           int addrKind,
-                          RegisterClass vdataClass,
+                          RegisterOperand vdataClass,
                           int elems,
                           bit hasRestrictedSOffset = 0,
-                          list<dag> pattern=[],
-                          // Workaround bug bz30254
-                          int addrKindCopy = addrKind,
-                          RegisterClass vdataClassCopy = vdataClass>
+                          list<dag> pattern=[]>
   : MTBUF_Pseudo<opName,
                  (outs),
-                 getMTBUFIns<addrKindCopy, [vdataClassCopy], hasRestrictedSOffset>.ret,
-                 getMTBUFAsmOps<addrKindCopy>.ret,
+                 getMTBUFIns<addrKind, [vdataClass], hasRestrictedSOffset>.ret,
+                 getMTBUFAsmOps<addrKind>.ret,
                  pattern>,
-    MTBUF_SetupAddr<addrKindCopy> {
-  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+    MTBUF_SetupAddr<addrKind> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
   let mayLoad = 0;
   let mayStore = 1;
   let elements = elems;
 }
 
-multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterOperand vdataClass,
                                int elems, bit hasRestrictedSOffset> {
 
   def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasRestrictedSOffset>,
@@ -305,7 +298,7 @@ multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass,
   }
 }
 
-multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterOperand vdataClass,
                                int elems> {
   defm NAME : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 0>;
   defm _VBUFFER : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 1>;
@@ -346,7 +339,6 @@ class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
   let AsmMatchConverter    = ps.AsmMatchConverter;
   let OtherPredicates      = ps.OtherPredicates;
   let Constraints          = ps.Constraints;
-  let DisableEncoding      = ps.DisableEncoding;
   let TSFlags              = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let SchedRW              = ps.SchedRW;
@@ -401,21 +393,29 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
   let sccb_value  = 0;
 }
 
-class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
-  RegisterOperand tfeVDataOp =
-    !cond(!eq(RC.Size, 32)  : AVLdSt_64,
-          !eq(RC.Size, 64)  : AVLdSt_96,
-          !eq(RC.Size, 96)  : AVLdSt_128,
-          !eq(RC.Size, 128) : AVLdSt_160);
+class getBUFVDataRegisterOperand<int Size, bit isTFE> {
+  defvar tfeVDataOp =
+    !cond(!eq(Size, 16)  : AVLdSt_64,
+          !eq(Size, 32)  : AVLdSt_64,
+          !eq(Size, 64)  : AVLdSt_96,
+          !eq(Size, 96)  : AVLdSt_128,
+          !eq(Size, 128) : AVLdSt_160);
+
+  defvar VDataOp =
+    !cond(!eq(Size, 16)   : AVLdSt_32,
+          !eq(Size, 32)   : AVLdSt_32,
+          !eq(Size, 64)   : AVLdSt_64,
+          !eq(Size, 96)   : AVLdSt_96,
+          !eq(Size, 128)  : AVLdSt_128);
 
-  RegisterOperand ret = !if(isTFE, tfeVDataOp, getLdStRegisterOperand<RC>.ret);
+  RegisterOperand ret = !if(isTFE, tfeVDataOp, VDataOp);
 }
 
-class getMUBUFInsDA<list<RegisterClass> vdataList,
+class getMUBUFInsDA<list<RegisterOperand> vdataList,
                     list<RegisterClass> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
-  RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+  RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
-  RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret;
+  RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdataClass.RegClass.Size, isTFE>.ret;
 
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
   dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
@@ -443,7 +443,7 @@ class getMUBUFElements<ValueType vt> {
     );
 }
 
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE, bit hasRestrictedSOffset> {
+class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> {
   dag ret =
     !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
     !if(!eq(addrKind, BUFAddrKind.OffEn),  getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
@@ -491,19 +491,16 @@ class MUBUF_Load_Pseudo <string opName,
                          bit isTFE = 0,
                          bit hasRestrictedSOffset = 0,
                          list<dag> pattern=[],
-                         // Workaround bug bz30254
-                         int addrKindCopy = addrKind,
-                         RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret.RegClass,
-                         RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret>
+                         RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret>
   : MUBUF_Pseudo<opName,
                  !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
-                 !con(getMUBUFIns<addrKindCopy, [], isTFE, hasRestrictedSOffset>.ret,
+                 !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret,
                       !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
-                 getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
+                 getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
                  pattern>,
-    MUBUF_SetupAddr<addrKindCopy> {
+    MUBUF_SetupAddr<addrKind> {
   let PseudoInstr = opName # !if(isLds, "_lds", "") # !if(isTFE, "_tfe", "") #
-                    "_" # getAddrName<addrKindCopy>.ret;
+                    "_" # getAddrName<addrKind>.ret;
   let AsmMatchConverter = "cvtMubuf";
 
   let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
@@ -593,17 +590,15 @@ class MUBUF_Store_Pseudo <string opName,
                           ValueType store_vt,
                           bit isTFE = 0,
                           bit hasRestrictedSOffset = 0,
-                          list<dag> pattern=[],
-                          // Workaround bug bz30254
-                          int addrKindCopy = addrKind>
+                          list<dag> pattern=[]>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE, hasRestrictedSOffset>.ret,
-                 getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret,
+                 getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret,
+                 getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret,
                  pattern>,
-    MUBUF_SetupAddr<addrKindCopy> {
+    MUBUF_SetupAddr<addrKind> {
   let PseudoInstr = opName # "_" # !if(isTFE, "_tfe", "") #
-                    getAddrName<addrKindCopy>.ret;
+                    getAddrName<addrKind>.ret;
   let mayLoad = 0;
   let mayStore = 1;
   let elements = getMUBUFElements<store_vt>.ret;
@@ -676,10 +671,9 @@ class MUBUF_Pseudo_Store_Lds<string opName>
   let AsmMatchConverter = "cvtMubuf";
 }
 
-class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestrictedSOffset,
+class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset,
                           list<RegisterClass> vaddrList=[]> {
   RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
-  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
 
   dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
   dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
@@ -692,22 +686,20 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestric
 }
 
 class getMUBUFAtomicIns<int addrKind,
-                        RegisterClass vdataClass,
+                        RegisterOperand vdataClass,
                         bit vdata_in,
-                        bit hasRestrictedSOffset,
-                        // Workaround bug bz30254
-                        RegisterClass vdataClassCopy=vdataClass> {
+                        bit hasRestrictedSOffset> {
   dag ret =
     !if(!eq(addrKind, BUFAddrKind.Offset),
-            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset>.ret,
+            getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret,
     !if(!eq(addrKind, BUFAddrKind.OffEn),
-            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+            getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
     !if(!eq(addrKind, BUFAddrKind.IdxEn),
-            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
+            getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret,
     !if(!eq(addrKind, BUFAddrKind.BothEn),
-            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+            getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
     !if(!eq(addrKind, BUFAddrKind.Addr64),
-            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
+            getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64]>.ret,
     (ins))))));
 }
 
@@ -716,11 +708,9 @@ class MUBUF_Atomic_Pseudo<string opName,
                           dag outs,
                           dag ins,
                           string asmOps,
-                          list<dag> pattern=[],
-                          // Workaround bug bz30254
-                          int addrKindCopy = addrKind>
+                          list<dag> pattern=[]>
   : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>,
-    MUBUF_SetupAddr<addrKindCopy> {
+    MUBUF_SetupAddr<addrKind> {
   let mayStore = 1;
   let mayLoad = 1;
   let hasSideEffects = 1;
@@ -732,18 +722,15 @@ class MUBUF_Atomic_Pseudo<string opName,
 }
 
 class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
-                               RegisterClass vdataClass,
+                               RegisterOperand vdataClass,
                                bit hasRestrictedSOffset = 0,
-                               list<dag> pattern=[],
-                               // Workaround bug bz30254
-                               int addrKindCopy = addrKind,
-                               RegisterClass vdataClassCopy = vdataClass>
-  : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+                               list<dag> pattern=[]>
+  : MUBUF_Atomic_Pseudo<opName, addrKind,
                         (outs),
-                        getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0, hasRestrictedSOffset>.ret,
-                        getMUBUFAsmOps<addrKindCopy>.ret,
+                        getMUBUFAtomicIns<addrKind, vdataClass, 0, hasRestrictedSOffset>.ret,
+                        getMUBUFAsmOps<addrKind>.ret,
                         pattern> {
-  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+  let PseudoInstr = opName # "_" # getAddrName<addrKind>.ret;
   let glc_value = 0;
   let dlc_value = 0;
   let sccb_value = 0;
@@ -751,29 +738,24 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
 }
 
 class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
-                             RegisterClass vdataClass,
+                             RegisterOperand vdata_op,
                              bit hasRestrictedSOffset = 0,
-                             list<dag> pattern=[],
-                             // Workaround bug bz30254
-                             int addrKindCopy = addrKind,
-                             RegisterClass vdataClassCopy = vdataClass,
-                             RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
-  : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+                             list<dag> pattern=[]>
+  : MUBUF_Atomic_Pseudo<opName, addrKind,
                         (outs vdata_op:$vdata),
-                        getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1, hasRestrictedSOffset>.ret,
-                        getMUBUFAsmOps<addrKindCopy>.ret,
+                        getMUBUFAtomicIns<addrKind, vdata_op, 1, hasRestrictedSOffset>.ret,
+                        getMUBUFAsmOps<addrKind>.ret,
                         pattern> {
-  let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
+  let PseudoInstr = opName # "_rtn_" # getAddrName<addrKind>.ret;
   let glc_value = 1;
   let dlc_value = 0;
   let sccb_value = 0;
   let IsAtomicRet = 1;
   let Constraints = "$vdata = $vdata_in";
-  let DisableEncoding = "$vdata_in";
 }
 
 multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
-                                        RegisterClass vdataClass,
+                                        RegisterOperand vdataClass,
                                         ValueType vdataType> {
   let FPAtomic = vdataType.isFP in {
     def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
@@ -795,7 +777,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
 }
 
 multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
-                                     RegisterClass vdataClass,
+                                     RegisterOperand vdataClass,
                                      ValueType vdataType,
                                      SDPatternOperator atomic> {
   let FPAtomic = vdataType.isFP in {
@@ -834,7 +816,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
 }
 
 multiclass MUBUF_Pseudo_Atomics <string opName,
-                                 RegisterClass vdataClass,
+                                 RegisterOperand vdataClass,
                                  ValueType vdataType,
                                  SDPatternOperator atomic = null_frag> :
   MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
@@ -1029,87 +1011,87 @@ defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
 }
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_swap", VGPR_32, i32
+  "buffer_atomic_swap", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_cmpswap", VReg_64, v2i32
+  "buffer_atomic_cmpswap", AVLdSt_64, v2i32
 >;
 defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_add", VGPR_32, i32
+  "buffer_atomic_add", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_sub", VGPR_32, i32
+  "buffer_atomic_sub", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smin", VGPR_32, i32
+  "buffer_atomic_smin", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umin", VGPR_32, i32
+  "buffer_atomic_umin", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smax", VGPR_32, i32
+  "buffer_atomic_smax", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umax", VGPR_32, i32
+  "buffer_atomic_umax", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_and", VGPR_32, i32
+  "buffer_atomic_and", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_or", VGPR_32, i32
+  "buffer_atomic_or", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_xor", VGPR_32, i32
+  "buffer_atomic_xor", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_inc", VGPR_32, i32
+  "buffer_atomic_inc", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_dec", VGPR_32, i32
+  "buffer_atomic_dec", AVLdSt_32, i32
 >;
 defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_swap_x2", VReg_64, i64
+  "buffer_atomic_swap_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_cmpswap_x2", VReg_128, v2i64
+  "buffer_atomic_cmpswap_x2", AVLdSt_128, v2i64
 >;
 defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_add_x2", VReg_64, i64
+  "buffer_atomic_add_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_sub_x2", VReg_64, i64
+  "buffer_atomic_sub_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smin_x2", VReg_64, i64
+  "buffer_atomic_smin_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umin_x2", VReg_64, i64
+  "buffer_atomic_umin_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_smax_x2", VReg_64, i64
+  "buffer_atomic_smax_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_umax_x2", VReg_64, i64
+  "buffer_atomic_umax_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_and_x2", VReg_64, i64
+  "buffer_atomic_and_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_or_x2", VReg_64, i64
+  "buffer_atomic_or_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_xor_x2", VReg_64, i64
+  "buffer_atomic_xor_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_inc_x2", VReg_64, i64
+  "buffer_atomic_inc_x2", AVLdSt_64, i64
 >;
 defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_dec_x2", VReg_64, i64
+  "buffer_atomic_dec_x2", AVLdSt_64, i64
 >;
 
 let OtherPredicates = [HasGFX10_BEncoding] in {
   defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
-    "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
+    "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
   >;
 }
 
@@ -1130,22 +1112,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
 let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
 
 defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
+  "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
 >;
 }
 
 let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
 defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmin", VGPR_32, f32, null_frag
+  "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
 >;
 defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmax", VGPR_32, f32, null_frag
+  "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
 >;
 }
 
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
+  "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
 >;
 }
 
@@ -1204,34 +1186,34 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <
 
 let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
-  "buffer_atomic_add_f32", VGPR_32, f32
+  "buffer_atomic_add_f32", AVLdSt_32, f32
 >;
 
 let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
-  "buffer_atomic_pk_add_f16", VGPR_32, v2f16
+  "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
 >;
 
 let SubtargetPredicate = HasAtomicFaddRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
-  "buffer_atomic_add_f32", VGPR_32, f32, null_frag
+  "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
 >;
 
 let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
-  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
+  "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
 >;
 
 let SubtargetPredicate = isGFX12Plus in {
 defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_cond_sub_u32", VGPR_32, i32
+  "buffer_atomic_cond_sub_u32", VGPROp_32, i32
 >;
 }
 
 let SubtargetPredicate = HasAtomicBufferPkAddBF16Inst in {
 let FPAtomic = 1 in
 defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
+  "buffer_atomic_pk_add_bf16", AVLdSt_32, v2bf16
 >;
 }
 
@@ -1239,39 +1221,39 @@ defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//
 let OtherPredicates = [HasMTBUFInsts] in {
-defm TBUFFER_LOAD_FORMAT_X     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_x",     VGPR_32,  1>;
-defm TBUFFER_LOAD_FORMAT_XY    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xy",    VReg_64,  2>;
-defm TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyz",   VReg_96,  3>;
-defm TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyzw",  VReg_128, 4>;
-defm TBUFFER_STORE_FORMAT_X    : MTBUF_Pseudo_Stores <"tbuffer_store_format_x",    VGPR_32,  1>;
-defm TBUFFER_STORE_FORMAT_XY   : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",   VReg_64,  2>;
-defm TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz",  VReg_96,  3>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>;
+defm TBUFFER_LOAD_FORMAT_X     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_x",     AVLdSt_32,  1>;
+defm TBUFFER_LOAD_FORMAT_XY    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xy",    AVLdSt_64,  2>;
+defm TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyz",   AVLdSt_96,  3>;
+defm TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_xyzw",  AVLdSt_128, 4>;
+defm TBUFFER_STORE_FORMAT_X    : MTBUF_Pseudo_Stores <"tbuffer_store_format_x",    AVLdSt_32,  1>;
+defm TBUFFER_STORE_FORMAT_XY   : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy",   AVLdSt_64,  2>;
+defm TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz",  AVLdSt_96,  3>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", AVLdSt_128, 4>;
 
 let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
 let TiedSourceNotRead = 1 in {
-  defm TBUFFER_LOAD_FORMAT_D16_X_gfx80     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_x",     VGPR_32,  1>;
-  defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xy",    VReg_64,  2>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyz",   VReg_96,  3>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyzw",  VReg_128, 4>;
-}
-  defm TBUFFER_STORE_FORMAT_D16_X_gfx80    : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x",    VGPR_32,  1>;
-  defm TBUFFER_STORE_FORMAT_D16_XY_gfx80   : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy",   VReg_64,  2>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80  : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz",  VReg_96,  3>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>;
+  defm TBUFFER_LOAD_FORMAT_D16_X_gfx80     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_x",     AVLdSt_32,  1>;
+  defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xy",    AVLdSt_64,  2>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyz",   AVLdSt_96,  3>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyzw",  AVLdSt_128, 4>;
+}
+  defm TBUFFER_STORE_FORMAT_D16_X_gfx80    : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x",    AVLdSt_32,  1>;
+  defm TBUFFER_STORE_FORMAT_D16_XY_gfx80   : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy",   AVLdSt_64,  2>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80  : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz",  AVLdSt_96,  3>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_128, 4>;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
 let TiedSourceNotRead = 1 in {
-  defm TBUFFER_LOAD_FORMAT_D16_X     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_x",     VGPR_32, 1>;
-  defm TBUFFER_LOAD_FORMAT_D16_XY    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xy",    VGPR_32, 2>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZ   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyz",   VReg_64, 3>;
-  defm TBUFFER_LOAD_FORMAT_D16_XYZW  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyzw",  VReg_64, 4>;
-}
-  defm TBUFFER_STORE_FORMAT_D16_X    : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x",    VGPR_32, 1>;
-  defm TBUFFER_STORE_FORMAT_D16_XY   : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy",   VGPR_32, 2>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZ  : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz",  VReg_64, 3>;
-  defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>;
+  defm TBUFFER_LOAD_FORMAT_D16_X     : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_x",     AVLdSt_32, 1>;
+  defm TBUFFER_LOAD_FORMAT_D16_XY    : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xy",    AVLdSt_32, 2>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZ   : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyz",   AVLdSt_64, 3>;
+  defm TBUFFER_LOAD_FORMAT_D16_XYZW  : MTBUF_Pseudo_Loads  <"tbuffer_load_format_d16_xyzw",  AVLdSt_64, 4>;
+}
+  defm TBUFFER_STORE_FORMAT_D16_X    : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x",    AVLdSt_32, 1>;
+  defm TBUFFER_STORE_FORMAT_D16_XY   : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy",   AVLdSt_32, 2>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZ  : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz",  AVLdSt_64, 3>;
+  defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", AVLdSt_64, 4>;
 } // End HasPackedD16VMem.
 } // End HasMTBUFInsts.
 
@@ -1300,14 +1282,14 @@ let SubtargetPredicate = isGFX90APlus in {
 } // End SubtargetPredicate = isGFX90APlus
 
 let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst in {
-  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
+  defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", AVLdSt_64, f64>;
 } // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
   // Note the names can be buffer_atomic_fmin_x2/buffer_atomic_fmax_x2
   // depending on some subtargets.
-  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
-  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
+  defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", AVLdSt_64, f64>;
+  defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", AVLdSt_64, f64>;
 }
 
 def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@@ -2414,7 +2396,6 @@ class VBUFFER_Real <bits<8> op, BUF_Pseudo ps, string real_name> :
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let OtherPredicates    = ps.OtherPredicates;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let SchedRW            = ps.SchedRW;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130e..aae56eef73ed 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -6,7 +6,10 @@ tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth
+              -ignore-non-decodable-operands
+              -ignore-fully-defined-operands)
 tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering)
@@ -71,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUImageIntrinsicOptimizer.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerBufferFatPointers.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
@@ -82,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
   AMDGPUIGroupLP.cpp
+  AMDGPULowerVGPREncoding.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3ff675d6e5e9..f2e432fa8d7f 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -8,7 +8,7 @@
 
 class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
   InstSI <outs, ins, "", pattern>,
-  SIMCInstr <opName, SIEncodingFamily.NONE> {
+  SIMCInstr <NAME, SIEncodingFamily.NONE> {
 
   let LGKM_CNT = 1;
   let DS = 1;
@@ -19,6 +19,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
   // Most instruction load and store data, so set this as the default.
   let mayLoad = 1;
   let mayStore = 1;
+  let FixedSize = true;
 
   let hasSideEffects = 0;
   let SchedRW = [WriteLDS];
@@ -76,7 +77,6 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
   let isConvergent       = ps.isConvergent;
 
   let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // encoding fields
   bits<10> vdst;
@@ -91,16 +91,33 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
   let offset0 = !if(ps.has_offset, offset{7-0}, ?);
   let offset1 = !if(ps.has_offset, offset{15-8}, ?);
 
-  bits<1> acc = !if(ps.has_vdst, vdst{9},
-                    !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
+  // Figure out if we should set the acc bit. Simple load and store
+  // instructions with a single data operand can use AV_* classes, in
+  // which case the encoding comes from the assigned register field.
+
+  // For more complicated cases with multiple data operands, the
+  // register fields are only 8-bit, so data operands must all be AGPR
+  // or VGPR.
+  defvar DstOpIsAV = !if(ps.has_vdst,
+                         VDstOperandIsAV<ps.OutOperandList>.ret, 0);
+  defvar DstOpIsAGPR = !if(ps.has_vdst,
+                           VDstOperandIsAGPR<ps.OutOperandList>.ret, 0);
+  defvar DataOpIsAV = !if(!or(ps.has_data0, ps.has_gws_data0),
+                          Data0OperandIsAV<ps.InOperandList>.ret, 0);
+  defvar DataOpIsAGPR = !if(!or(ps.has_data0, ps.has_gws_data0),
+                            Data0OperandIsAGPR<ps.InOperandList>.ret, 0);
+
+  bits<1> acc = !if(ps.has_vdst,
+                    !if(DstOpIsAV, vdst{9}, DstOpIsAGPR),
+                    !if(DataOpIsAV, data0{9}, DataOpIsAGPR));
 }
 
 // DS Pseudo instructions
 
-class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
 : DS_Pseudo<opName,
   (outs),
-  (ins getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds),
+  (ins rc:$data0, Offset:$offset, gds:$gds),
   " $data0$offset$gds"> {
 
   let has_addr = 0;
@@ -108,10 +125,10 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
   let has_vdst = 0;
 }
 
-class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32>
 : DS_Pseudo<opName,
   (outs),
-  (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, Offset:$offset, gds:$gds),
+  (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds),
   " $addr, $data0$offset$gds"> {
 
   let has_data1 = 0;
@@ -119,7 +136,7 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
   let IsAtomicNoRet = 1;
 }
 
-multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_NORET_mc<string opName, RegisterOperand rc = AVLdSt_32> {
   def "" : DS_1A1D_NORET<opName, rc>;
 
   let has_m0_read = 0 in {
@@ -127,23 +144,23 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
   }
 }
 
-multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32> 
+multiclass DS_1A1D_NORET_t16<string opName, RegisterOperand rc = AVLdSt_32>
 : DS_1A1D_NORET_mc<opName, rc> {
   let has_m0_read = 0 in {
     let True16Predicate = UseRealTrue16Insts in {
-      def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+      def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPROp_16>,
+        True16D16Table<NAME#"_D16_HI", NAME#"_gfx9">;
     }
   }
 }
 
-multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32> {
   let has_m0_read = 0 in {
     def "" : DS_1A1D_NORET<opName, rc>;
   }
 }
 
-class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
-                    RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32>
 : DS_Pseudo<opName,
   (outs),
   (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds),
@@ -153,16 +170,24 @@ class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
   let IsAtomicNoRet = 1;
 }
 
-multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
+// DS_xx2D cases should only be instantiated with VGPR operand classes.
+multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> {
+  assert OperandIsVGPR<rc>.ret,
+         "DS with 2 data operands should be declared with VGPRs";
+
   def "" : DS_1A2D_NORET<opName, rc>;
 
   let has_m0_read = 0 in {
     def _gfx9 : DS_1A2D_NORET<opName, rc>;
+
+    // All data operands are replaced with AGPRs in this form.
+    let SubtargetPredicate = isGFX90APlus in {
+      def _agpr : DS_1A2D_NORET<opName, getEquivalentAGPROperand<rc>.ret>;
+    }
   }
 }
 
-class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
-                          RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32>
 : DS_Pseudo<opName,
   (outs),
   (ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
@@ -173,17 +198,23 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
   let has_offset = 0;
 }
 
-multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterOperand rc = VGPROp_32> {
+  assert OperandIsVGPR<rc>.ret,
+         "DS with 2 data operands should be declared with VGPRs";
+
   def "" : DS_1A2D_Off8_NORET<opName, rc>;
 
   let has_m0_read = 0 in {
     def _gfx9 : DS_1A2D_Off8_NORET<opName, rc>;
+
+    let SubtargetPredicate = isGFX90APlus in {
+      def _agpr : DS_1A2D_Off8_NORET<opName, getEquivalentAGPROperand<rc>.ret>;
+    }
   }
 }
 
-class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc,
-                  RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
-                  RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
+class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32,
+                                     RegisterOperand src_op = dst_op>
 : DS_Pseudo<opName,
   (outs dst_op:$vdst),
   (ins src_op:$data0, Offset:$offset),
@@ -196,8 +227,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass s
   let hasSideEffects = 1;
 }
 
-class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
-                  RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32>
 : DS_Pseudo<opName,
   (outs data_op:$vdst),
   (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds),
@@ -207,76 +237,84 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
   let IsAtomicRet = 1;
 }
 
-multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_RET_mc <string opName, RegisterOperand rc = VGPROp_32> {
+  assert OperandIsVGPR<rc>.ret,
+         "DS with 2 data operands should be declared with VGPRs";
+
   def "" : DS_1A1D_RET<opName, rc>;
 
   let has_m0_read = 0 in {
     def _gfx9 : DS_1A1D_RET<opName, rc>;
+    def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>;
   }
 }
 
-multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterOperand rc = VGPROp_32> {
   let has_m0_read = 0 in {
     def "" : DS_1A1D_RET<opName, rc>;
+    def _agpr : DS_1A1D_RET<opName, getEquivalentAGPROperand<rc>.ret>;
   }
 }
 
 class DS_1A2D_RET<string opName,
-                  RegisterClass rc = VGPR_32,
-                  RegisterClass src = rc,
-                  RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
-                  RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
-: DS_Pseudo<opName,
-  (outs dst_op:$vdst),
-  (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset:$offset, gds:$gds),
+                  RegisterOperand dst_rc = VGPROp_32,
+                  RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName,
+  (outs dst_rc:$vdst),
+  (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds),
   " $vdst, $addr, $data0, $data1$offset$gds"> {
 
   let IsAtomicRet = 1;
 }
 
 multiclass DS_1A2D_RET_mc<string opName,
-                          RegisterClass rc = VGPR_32,
-                          RegisterClass src = rc> {
-  def "" : DS_1A2D_RET<opName, rc, src>;
+                          RegisterOperand dst_rc = VGPROp_32,
+                          RegisterOperand src_rc = dst_rc> {
+  assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret),
+         "DS with 2 data operands should be declared with VGPRs";
+
+  def "" : DS_1A2D_RET<opName, dst_rc, src_rc>;
 
   let has_m0_read = 0 in {
-    def _gfx9 : DS_1A2D_RET<opName, rc, src>;
+    def _gfx9 : DS_1A2D_RET<opName, dst_rc, src_rc>;
+    def _agpr : DS_1A2D_RET<opName, getEquivalentAGPROperand<dst_rc>.ret,
+                                    getEquivalentAGPROperand<src_rc>.ret>;
   }
 }
 
 class DS_1A2D_Off8_RET<string opName,
-                       RegisterClass rc = VGPR_32,
-                       RegisterClass src = rc,
-                       RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
-                       RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
+                       RegisterOperand dst_rc = VGPROp_32,
+                       RegisterOperand src_rc = dst_rc>
 : DS_Pseudo<opName,
-  (outs dst_op:$vdst),
-  (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
+  (outs dst_rc:$vdst),
+  (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds),
   " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
 
   let has_offset = 0;
 }
 
 multiclass DS_1A2D_Off8_RET_mc<string opName,
-                               RegisterClass rc = VGPR_32,
-                               RegisterClass src = rc> {
-  def "" : DS_1A2D_Off8_RET<opName, rc, src>;
+                               RegisterOperand dst_rc = VGPROp_32,
+                               RegisterOperand src_rc = dst_rc> {
+  assert !and(OperandIsVGPR<dst_rc>.ret, OperandIsVGPR<src_rc>.ret)  ,
+         "DS with 2 data operands should be declared with VGPRs";
+
+  def "" : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>;
 
   let has_m0_read = 0 in {
-    def _gfx9 : DS_1A2D_Off8_RET<opName, rc, src>;
+    def _gfx9 : DS_1A2D_Off8_RET<opName, dst_rc, src_rc>;
+    def _agpr : DS_1A2D_Off8_RET<opName, getEquivalentAGPROperand<dst_rc>.ret,
+                                         getEquivalentAGPROperand<src_rc>.ret>;
   }
 }
 
 class DS_BVH_STACK<string opName,
-                   RegisterClass vdst_rc,
-                   RegisterClass data1_rc>
+                   RegisterOperand vdst_rc,
+                   RegisterOperand data1_rc>
 : DS_Pseudo<opName,
-  (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst, VGPR_32:$addr),
-  (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
-   data1_rc:$data1, Offset:$offset),
+  (outs vdst_rc:$vdst, VGPR_32:$addr),
+  (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset),
   " $vdst, $addr, $data0, $data1$offset"> {
   let Constraints = "$addr = $addr_in";
-  let DisableEncoding = "$addr_in";
   let has_gds = 0;
   let gdsValue = 0;
   // TODO: Use MMOs in the LDS address space instead of hasSideEffects = 1.
@@ -284,8 +322,8 @@ class DS_BVH_STACK<string opName,
   let SchedRW = [WriteLDS, WriteLDS];
 }
 
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset,
-                RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
+class DS_1A_RET<string opName, RegisterOperand data_op = AVLdSt_32,
+                bit HasTiedOutput = 0, Operand ofs = Offset>
 : DS_Pseudo<opName,
   (outs data_op:$vdst),
   !if(HasTiedOutput,
@@ -293,12 +331,12 @@ class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0
     (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
   " $vdst, $addr$offset$gds"> {
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
-  let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
   let has_data0 = 0;
   let has_data1 = 0;
 }
 
-multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> {
+multiclass DS_1A_RET_mc<string opName, RegisterOperand rc = AVLdSt_32,
+                        bit HasTiedOutput = 0, Operand ofs = Offset> {
   def "" : DS_1A_RET<opName, rc, HasTiedOutput, ofs>;
 
   let has_m0_read = 0 in {
@@ -306,27 +344,28 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
   }
 }
 
-multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset> 
+multiclass DS_1A_RET_t16<string opName, RegisterOperand rc = AVLdSt_32,
+                         bit HasTiedOutput = 0, Operand ofs = Offset>
 : DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
   let has_m0_read = 0 in {
     let True16Predicate = UseRealTrue16Insts in {
-      def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
+      def "_t16" : DS_1A_RET<opName#"_t16", VGPROp_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
     }
   }
 }
 
-multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A_RET_NoM0<string opName, RegisterOperand rc = VGPROp_32> {
   let has_m0_read = 0 in {
     def "" : DS_1A_RET<opName, rc>;
   }
 }
 
-class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> :
   DS_1A_RET<opName, rc, 1>;
 
-class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32>
 : DS_Pseudo<opName,
-  (outs getLdStRegisterOperand<rc>.ret:$vdst),
+  (outs rc:$vdst),
   (ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds),
   " $vdst, $addr$offset0$offset1$gds"> {
 
@@ -335,7 +374,7 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
   let has_data1 = 0;
 }
 
-multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
+multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> {
   def "" : DS_1A_Off8_RET<opName, rc>;
 
   let has_m0_read = 0 in {
@@ -344,7 +383,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
 }
 
 class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
-  (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
+  (outs AVLdSt_32:$vdst),
   (ins VGPR_32:$addr, Offset:$offset),
   " $vdst, $addr$offset gds"> {
 
@@ -369,7 +408,7 @@ class DS_1A_Off16_NORET <string opName>
 }
 
 class DS_0A_RET <string opName> : DS_Pseudo<opName,
-  (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
+  (outs AVLdSt_32:$vdst),
   (ins Offset:$offset, gds:$gds),
   " $vdst$offset$gds"> {
 
@@ -424,7 +463,7 @@ class DS_GWS_0D <string opName>
 
 class DS_GWS_1D <string opName>
 : DS_GWS<opName,
-  (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, Offset:$offset),
+  (ins AVLdSt_32:$data0, Offset:$offset),
   " $data0$offset gds"> {
 
   let has_gws_data0 = 1;
@@ -449,7 +488,7 @@ class DS_VOID <string opName> : DS_Pseudo<opName,
 }
 
 class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
-                       RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret>
+                       RegisterOperand data_op = AVLdSt_32>
 : DS_Pseudo<opName,
   (outs data_op:$vdst),
   (ins VGPR_32:$addr, data_op:$data0, Offset:$offset),
@@ -465,12 +504,75 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
   let has_gds = 0;
 }
 
+multiclass DS_1A1D_PERMUTE_mc <string opName, SDPatternOperator node = null_frag,
+                                RegisterOperand data_op = VGPROp_32> {
+  assert OperandIsVGPR<data_op>.ret,
+         "DS with 2 data operands should be declared with VGPRs";
+  def "" : DS_1A1D_PERMUTE<opName, node, data_op>;
+
+  let SubtargetPredicate = isGFX90APlus in {
+    def _agpr : DS_1A1D_PERMUTE<opName, null_frag,
+                                getEquivalentAGPROperand<data_op>.ret>;
+  }
+}
+
+
 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
   bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
   (inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
   let AddedComplexity = complexity;
 }
 
+multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+  let OtherPredicates = [LDSRequiresM0Init] in {
+    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+  }
+
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+                         !cast<PatFrag>(frag#"_local_"#vt)>;
+  }
+
+  let OtherPredicates = [HasGDS] in {
+    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
+                         /* complexity */ 0, /* gds */ 1>;
+  }
+}
+
+multiclass DSAtomicRetNoRetPat_NoM0_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+                                       ValueType vt, string frag> {
+  def : DSAtomicRetPat<inst, vt,
+                       !cast<PatFrag>(frag#"_local_"#vt)>;
+  def : DSAtomicRetPat<noRetInst, vt,
+                       !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+}
+
+multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+                                  ValueType vt, string frag> {
+  let OtherPredicates = [LDSRequiresM0Init] in {
+    def : DSAtomicRetPat<inst, vt,
+                         !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+    def : DSAtomicRetPat<noRetInst, vt,
+                         !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+  }
+
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    defm : DSAtomicRetNoRetPat_NoM0_mc<
+      !cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"),
+      !cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"),
+      vt, frag>;
+  }
+
+  let OtherPredicates = [HasGDS] in {
+    def : DSAtomicRetPat<inst, vt,
+                         !cast<PatFrag>(frag#"_region_m0_"#vt),
+                         /* complexity */ 0, /* gds */ 1>;
+    def : DSAtomicRetPat<noRetInst, vt,
+                         !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+                         /* complexity */ 1, /* gds */ 1>;
+  }
+}
+
 defm DS_ADD_U32       : DS_1A1D_NORET_mc<"ds_add_u32">;
 defm DS_SUB_U32       : DS_1A1D_NORET_mc<"ds_sub_u32">;
 defm DS_RSUB_U32      : DS_1A1D_NORET_mc<"ds_rsub_u32">;
@@ -516,100 +618,100 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
 } // End mayLoad = 0
 
 let SubtargetPredicate = HasLdsAtomicAddF64 in {
-  defm DS_ADD_F64     : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>;
-  defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64>;
+  defm DS_ADD_F64     : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", AVLdSt_64>;
+  defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VGPROp_64>;
 } // End SubtargetPredicate = HasLdsAtomicAddF64
 
 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-  defm DS_PK_ADD_F16      : DS_1A1D_NORET_mc<"ds_pk_add_f16">;
-  defm DS_PK_ADD_RTN_F16  : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32>;
-  defm DS_PK_ADD_BF16     : DS_1A1D_NORET_mc<"ds_pk_add_bf16">;
-  defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32>;
+  defm DS_PK_ADD_F16      : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
+  defm DS_PK_ADD_RTN_F16  : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16">;
+  defm DS_PK_ADD_BF16     : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
+  defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16">;
 } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
 
 defm DS_CMPSTORE_B32     : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
 defm DS_CMPSTORE_F32     : DS_1A2D_NORET_mc<"ds_cmpstore_f32">;
-defm DS_CMPSTORE_B64     : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>;
-defm DS_CMPSTORE_F64     : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>;
-defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32>;
-defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32>;
-defm DS_CMPSTORE_RTN_B64  : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64>;
-defm DS_CMPSTORE_RTN_F64  : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64>;
+defm DS_CMPSTORE_B64     : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VGPROp_64>;
+defm DS_CMPSTORE_F64     : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VGPROp_64>;
+defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32">;
+defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32">;
+defm DS_CMPSTORE_RTN_B64  : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VGPROp_64>;
+defm DS_CMPSTORE_RTN_F64  : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VGPROp_64>;
 
 defm DS_MSKOR_B32     : DS_1A2D_NORET_mc<"ds_mskor_b32">;
 defm DS_CMPST_B32     : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
 defm DS_CMPST_F32     : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
 
-defm DS_ADD_U64       : DS_1A1D_NORET_mc<"ds_add_u64", VReg_64>;
-defm DS_SUB_U64       : DS_1A1D_NORET_mc<"ds_sub_u64", VReg_64>;
-defm DS_RSUB_U64      : DS_1A1D_NORET_mc<"ds_rsub_u64", VReg_64>;
-defm DS_INC_U64       : DS_1A1D_NORET_mc<"ds_inc_u64", VReg_64>;
-defm DS_DEC_U64       : DS_1A1D_NORET_mc<"ds_dec_u64", VReg_64>;
-defm DS_MIN_I64       : DS_1A1D_NORET_mc<"ds_min_i64", VReg_64>;
-defm DS_MAX_I64       : DS_1A1D_NORET_mc<"ds_max_i64", VReg_64>;
-defm DS_MIN_U64       : DS_1A1D_NORET_mc<"ds_min_u64", VReg_64>;
-defm DS_MAX_U64       : DS_1A1D_NORET_mc<"ds_max_u64", VReg_64>;
-defm DS_AND_B64       : DS_1A1D_NORET_mc<"ds_and_b64", VReg_64>;
-defm DS_OR_B64        : DS_1A1D_NORET_mc<"ds_or_b64", VReg_64>;
-defm DS_XOR_B64       : DS_1A1D_NORET_mc<"ds_xor_b64", VReg_64>;
-defm DS_MSKOR_B64     : DS_1A2D_NORET_mc<"ds_mskor_b64", VReg_64>;
+defm DS_ADD_U64       : DS_1A1D_NORET_mc<"ds_add_u64", AVLdSt_64>;
+defm DS_SUB_U64       : DS_1A1D_NORET_mc<"ds_sub_u64", AVLdSt_64>;
+defm DS_RSUB_U64      : DS_1A1D_NORET_mc<"ds_rsub_u64", AVLdSt_64>;
+defm DS_INC_U64       : DS_1A1D_NORET_mc<"ds_inc_u64", AVLdSt_64>;
+defm DS_DEC_U64       : DS_1A1D_NORET_mc<"ds_dec_u64", AVLdSt_64>;
+defm DS_MIN_I64       : DS_1A1D_NORET_mc<"ds_min_i64", AVLdSt_64>;
+defm DS_MAX_I64       : DS_1A1D_NORET_mc<"ds_max_i64", AVLdSt_64>;
+defm DS_MIN_U64       : DS_1A1D_NORET_mc<"ds_min_u64", AVLdSt_64>;
+defm DS_MAX_U64       : DS_1A1D_NORET_mc<"ds_max_u64", AVLdSt_64>;
+defm DS_AND_B64       : DS_1A1D_NORET_mc<"ds_and_b64", AVLdSt_64>;
+defm DS_OR_B64        : DS_1A1D_NORET_mc<"ds_or_b64", AVLdSt_64>;
+defm DS_XOR_B64       : DS_1A1D_NORET_mc<"ds_xor_b64", AVLdSt_64>;
+defm DS_MSKOR_B64     : DS_1A2D_NORET_mc<"ds_mskor_b64", VGPROp_64>;
 let mayLoad = 0 in {
-defm DS_WRITE_B64     : DS_1A1D_NORET_mc<"ds_write_b64", VReg_64>;
-defm DS_WRITE2_B64    : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VReg_64>;
-defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VReg_64>;
+defm DS_WRITE_B64     : DS_1A1D_NORET_mc<"ds_write_b64", AVLdSt_64>;
+defm DS_WRITE2_B64    : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VGPROp_64>;
+defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VGPROp_64>;
 }
-defm DS_CMPST_B64     : DS_1A2D_NORET_mc<"ds_cmpst_b64", VReg_64>;
-defm DS_CMPST_F64     : DS_1A2D_NORET_mc<"ds_cmpst_f64", VReg_64>;
-defm DS_MIN_F64       : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>;
-defm DS_MAX_F64       : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
+defm DS_CMPST_B64     : DS_1A2D_NORET_mc<"ds_cmpst_b64", VGPROp_64>;
+defm DS_CMPST_F64     : DS_1A2D_NORET_mc<"ds_cmpst_f64", VGPROp_64>;
+defm DS_MIN_F64       : DS_1A1D_NORET_mc<"ds_min_f64", AVLdSt_64>;
+defm DS_MAX_F64       : DS_1A1D_NORET_mc<"ds_max_f64", AVLdSt_64>;
 
-defm DS_ADD_RTN_U32   : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>;
+defm DS_ADD_RTN_U32   : DS_1A1D_RET_mc<"ds_add_rtn_u32">;
 
 let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
-defm DS_ADD_RTN_F32   : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>;
-}
-defm DS_SUB_RTN_U32   : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>;
-defm DS_RSUB_RTN_U32  : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32>;
-defm DS_INC_RTN_U32   : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32>;
-defm DS_DEC_RTN_U32   : DS_1A1D_RET_mc<"ds_dec_rtn_u32", VGPR_32>;
-defm DS_MIN_RTN_I32   : DS_1A1D_RET_mc<"ds_min_rtn_i32", VGPR_32>;
-defm DS_MAX_RTN_I32   : DS_1A1D_RET_mc<"ds_max_rtn_i32", VGPR_32>;
-defm DS_MIN_RTN_U32   : DS_1A1D_RET_mc<"ds_min_rtn_u32", VGPR_32>;
-defm DS_MAX_RTN_U32   : DS_1A1D_RET_mc<"ds_max_rtn_u32", VGPR_32>;
-defm DS_AND_RTN_B32   : DS_1A1D_RET_mc<"ds_and_rtn_b32", VGPR_32>;
-defm DS_OR_RTN_B32    : DS_1A1D_RET_mc<"ds_or_rtn_b32", VGPR_32>;
-defm DS_XOR_RTN_B32   : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32>;
-defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32>;
-defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32>;
-defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32>;
-defm DS_MIN_RTN_F32   : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32>;
-defm DS_MAX_RTN_F32   : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32>;
+defm DS_ADD_RTN_F32   : DS_1A1D_RET_mc<"ds_add_rtn_f32">;
+}
+defm DS_SUB_RTN_U32   : DS_1A1D_RET_mc<"ds_sub_rtn_u32">;
+defm DS_RSUB_RTN_U32  : DS_1A1D_RET_mc<"ds_rsub_rtn_u32">;
+defm DS_INC_RTN_U32   : DS_1A1D_RET_mc<"ds_inc_rtn_u32">;
+defm DS_DEC_RTN_U32   : DS_1A1D_RET_mc<"ds_dec_rtn_u32">;
+defm DS_MIN_RTN_I32   : DS_1A1D_RET_mc<"ds_min_rtn_i32">;
+defm DS_MAX_RTN_I32   : DS_1A1D_RET_mc<"ds_max_rtn_i32">;
+defm DS_MIN_RTN_U32   : DS_1A1D_RET_mc<"ds_min_rtn_u32">;
+defm DS_MAX_RTN_U32   : DS_1A1D_RET_mc<"ds_max_rtn_u32">;
+defm DS_AND_RTN_B32   : DS_1A1D_RET_mc<"ds_and_rtn_b32">;
+defm DS_OR_RTN_B32    : DS_1A1D_RET_mc<"ds_or_rtn_b32">;
+defm DS_XOR_RTN_B32   : DS_1A1D_RET_mc<"ds_xor_rtn_b32">;
+defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPROp_32>;
+defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPROp_32>;
+defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPROp_32>;
+defm DS_MIN_RTN_F32   : DS_1A1D_RET_mc<"ds_min_rtn_f32">;
+defm DS_MAX_RTN_F32   : DS_1A1D_RET_mc<"ds_max_rtn_f32">;
 
 defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
-defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>;
-defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>;
-
-defm DS_ADD_RTN_U64  : DS_1A1D_RET_mc<"ds_add_rtn_u64", VReg_64>;
-defm DS_SUB_RTN_U64  : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VReg_64>;
-defm DS_RSUB_RTN_U64  : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VReg_64>;
-defm DS_INC_RTN_U64   : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VReg_64>;
-defm DS_DEC_RTN_U64   : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VReg_64>;
-defm DS_MIN_RTN_I64    : DS_1A1D_RET_mc<"ds_min_rtn_i64", VReg_64>;
-defm DS_MAX_RTN_I64    : DS_1A1D_RET_mc<"ds_max_rtn_i64", VReg_64>;
-defm DS_MIN_RTN_U64   : DS_1A1D_RET_mc<"ds_min_rtn_u64", VReg_64>;
-defm DS_MAX_RTN_U64   : DS_1A1D_RET_mc<"ds_max_rtn_u64", VReg_64>;
-defm DS_AND_RTN_B64    : DS_1A1D_RET_mc<"ds_and_rtn_b64", VReg_64>;
-defm DS_OR_RTN_B64     : DS_1A1D_RET_mc<"ds_or_rtn_b64", VReg_64>;
-defm DS_XOR_RTN_B64    : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VReg_64>;
-defm DS_MSKOR_RTN_B64  : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VReg_64>;
-defm DS_CMPST_RTN_B64  : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VReg_64>;
-defm DS_CMPST_RTN_F64  : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VReg_64>;
-defm DS_MIN_RTN_F64    : DS_1A1D_RET_mc<"ds_min_rtn_f64", VReg_64>;
-defm DS_MAX_RTN_F64    : DS_1A1D_RET_mc<"ds_max_rtn_f64", VReg_64>;
-
-defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
-defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
-defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
+defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VGPROp_64, VGPROp_32>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VGPROp_64, VGPROp_32>;
+
+defm DS_ADD_RTN_U64  : DS_1A1D_RET_mc<"ds_add_rtn_u64", VGPROp_64>;
+defm DS_SUB_RTN_U64  : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VGPROp_64>;
+defm DS_RSUB_RTN_U64  : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VGPROp_64>;
+defm DS_INC_RTN_U64   : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VGPROp_64>;
+defm DS_DEC_RTN_U64   : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VGPROp_64>;
+defm DS_MIN_RTN_I64    : DS_1A1D_RET_mc<"ds_min_rtn_i64", VGPROp_64>;
+defm DS_MAX_RTN_I64    : DS_1A1D_RET_mc<"ds_max_rtn_i64", VGPROp_64>;
+defm DS_MIN_RTN_U64   : DS_1A1D_RET_mc<"ds_min_rtn_u64", VGPROp_64>;
+defm DS_MAX_RTN_U64   : DS_1A1D_RET_mc<"ds_max_rtn_u64", VGPROp_64>;
+defm DS_AND_RTN_B64    : DS_1A1D_RET_mc<"ds_and_rtn_b64", VGPROp_64>;
+defm DS_OR_RTN_B64     : DS_1A1D_RET_mc<"ds_or_rtn_b64", VGPROp_64>;
+defm DS_XOR_RTN_B64    : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VGPROp_64>;
+defm DS_MSKOR_RTN_B64  : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VGPROp_64>;
+defm DS_CMPST_RTN_B64  : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VGPROp_64>;
+defm DS_CMPST_RTN_F64  : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VGPROp_64>;
+defm DS_MIN_RTN_F64    : DS_1A1D_RET_mc<"ds_min_rtn_f64", VGPROp_64>;
+defm DS_MAX_RTN_F64    : DS_1A1D_RET_mc<"ds_max_rtn_f64", VGPROp_64>;
+
+defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VGPROp_64>;
+defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VGPROp_128, VGPROp_64>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VGPROp_128, VGPROp_64>;
 
 let isConvergent = 1, usesCustomInserter = 1 in {
 def DS_GWS_INIT       : DS_GWS_1D<"ds_gws_init"> {
@@ -657,19 +759,19 @@ def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
 } // End SubtargetPredicate = HasDsSrc2Insts
 
 let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", AVLdSt_32, 0, Swizzle>;
 }
 
 let mayStore = 0 in {
 defm DS_READ_I16     : DS_1A_RET_mc<"ds_read_i16">;
 defm DS_READ_B32     : DS_1A_RET_mc<"ds_read_b32">;
-defm DS_READ_B64     : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
+defm DS_READ_B64     : DS_1A_RET_mc<"ds_read_b64", AVLdSt_64>;
 
-defm DS_READ2_B32    : DS_1A_Off8_RET_mc<"ds_read2_b32", VReg_64>;
-defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B32    : DS_1A_Off8_RET_mc<"ds_read2_b32", AVLdSt_64>;
+defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", AVLdSt_64>;
 
-defm DS_READ2_B64    : DS_1A_Off8_RET_mc<"ds_read2_b64", VReg_128>;
-defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>;
+defm DS_READ2_B64    : DS_1A_Off8_RET_mc<"ds_read2_b64", AVLdSt_128>;
+defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", AVLdSt_128>;
 
 let has_m0_read = 0 in {
 let SubtargetPredicate = HasD16LoadStore, TiedSourceNotRead = 1 in {
@@ -704,21 +806,21 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
 
 let SubtargetPredicate = isGFX7Plus in {
 
-defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>;
-defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>;
+defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPROp_32>;
+defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VGPROp_64>;
 
 let isConvergent = 1, usesCustomInserter = 1 in {
 def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
 }
 
 let mayStore = 0 in {
-defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>;
-defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", VReg_128>;
+defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", AVLdSt_96>;
+defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", AVLdSt_128>;
 } // End mayStore = 0
 
 let mayLoad = 0 in {
-defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", VReg_96>;
-defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>;
+defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", AVLdSt_96>;
+defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", AVLdSt_128>;
 } // End mayLoad = 0
 
 def DS_NOP : DS_VOID<"ds_nop">;
@@ -732,10 +834,10 @@ def DS_NOP : DS_VOID<"ds_nop">;
 let SubtargetPredicate = isGFX8Plus in {
 
 let Uses = [EXEC] in {
-def DS_PERMUTE_B32  : DS_1A1D_PERMUTE <"ds_permute_b32",
-                                       int_amdgcn_ds_permute>;
-def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
-                                       int_amdgcn_ds_bpermute>;
+defm DS_PERMUTE_B32  : DS_1A1D_PERMUTE_mc<"ds_permute_b32",
+                                         int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32",
+                                         int_amdgcn_ds_bpermute>;
 }
 
 } // let SubtargetPredicate = isGFX8Plus
@@ -751,8 +853,8 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
 
 let SubtargetPredicate = isGFX11Only in {
 
-def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
-def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VGPROp_64, VGPROp_32>;
+def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VGPROp_64, VGPROp_32>;
 
 } // let SubtargetPredicate = isGFX11Only
 
@@ -760,7 +862,7 @@ let SubtargetPredicate = isGFX11Plus in {
 
 let OtherPredicates = [HasImageInsts] in
 def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
-                                        VGPR_32, VReg_128> ;
+                                        VGPROp_32, VGPROp_128> ;
 
 } // let SubtargetPredicate = isGFX11Plus
 
@@ -772,15 +874,15 @@ let SubtargetPredicate = isGFX12Plus in {
 
 let OtherPredicates = [HasImageInsts] in {
 def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
-  "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
+  "ds_bvh_stack_push8_pop1_rtn_b32", VGPROp_32, VGPROp_256>;
 def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
-  "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
+  "ds_bvh_stack_push8_pop2_rtn_b64", VGPROp_64, VGPROp_256>;
 } // End OtherPredicates = [HasImageInsts].
 
-defm DS_COND_SUB_U32      : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
-defm DS_COND_SUB_RTN_U32  : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
-defm DS_SUB_CLAMP_U32     : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
-defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32>;
+defm DS_COND_SUB_U32      : DS_1A1D_NORET_mc_gfx9<"ds_cond_sub_u32">;
+defm DS_COND_SUB_RTN_U32  : DS_1A1D_RET_mc_gfx9<"ds_cond_sub_rtn_u32", VGPROp_32>;
+defm DS_SUB_CLAMP_U32     : DS_1A1D_NORET_mc_gfx9<"ds_sub_clamp_u32">;
+defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_32>;
 def DS_BPERMUTE_FI_B32    : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
                                              int_amdgcn_ds_bpermute_fi_b32>;
 
@@ -801,11 +903,11 @@ let SubtargetPredicate = isGFX1250Plus in {
 
 let WaveSizePredicate = isWave32, mayStore = 0 in {
 let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
-defm DS_LOAD_TR4_B64   : DS_1A_RET_NoM0<"ds_load_tr4_b64",   VReg_64>;
-defm DS_LOAD_TR6_B96   : DS_1A_RET_NoM0<"ds_load_tr6_b96",   VReg_96>;
+defm DS_LOAD_TR4_B64   : DS_1A_RET_NoM0<"ds_load_tr4_b64",   VGPROp_64>;
+defm DS_LOAD_TR6_B96   : DS_1A_RET_NoM0<"ds_load_tr6_b96",   VGPROp_96>;
 } // End OtherPredicates = [HasTransposeLoadF4F6Insts]
-defm DS_LOAD_TR8_B64   : DS_1A_RET_NoM0<"ds_load_tr8_b64",   VReg_64>;
-defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
+defm DS_LOAD_TR8_B64   : DS_1A_RET_NoM0<"ds_load_tr8_b64",   VGPROp_64>;
+defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VGPROp_128>;
 } // End WaveSizePredicate = isWave32, mayStore = 0
 
 let OtherPredicates = [HasLdsBarrierArriveAtomic] in {
@@ -818,7 +920,7 @@ def : GCNPat <
   (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0))
 >;
 
-defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VReg_64>;
+defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>;
 
 def : GCNPat<
   (i64 (int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 (DS1Addr1Offset i32:$ptr, i32:$offset), i64:$data)),
@@ -829,10 +931,10 @@ def : GCNPat<
 } // End SubtargetPredicate = isGFX1250Plus
 
 let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
-  defm DS_READ_B64_TR_B4  : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
-  defm DS_READ_B64_TR_B8  : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
-  defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
-  defm DS_READ_B96_TR_B6  : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
+  defm DS_READ_B64_TR_B4  : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", AVLdSt_64>;
+  defm DS_READ_B64_TR_B8  : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", AVLdSt_64>;
+  defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", AVLdSt_64>;
+  defm DS_READ_B96_TR_B6  : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", AVLdSt_96>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -984,6 +1086,7 @@ class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : G
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
+// TODO: Should this use AVLdSt_64 instead of VReg_64 for $value's class?
 class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
   (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i32:$offset0, i32:$offset1)),
   (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
@@ -1091,50 +1194,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
 
 } // End AddedComplexity = 100
 
-multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
-  let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt)>;
-  }
-
-  let OtherPredicates = [NotLDSRequiresM0Init] in {
-    def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
-                         !cast<PatFrag>(frag#"_local_"#vt)>;
-  }
-
-  let OtherPredicates = [HasGDS] in {
-    def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt),
-                         /* complexity */ 0, /* gds */ 1>;
-  }
-}
-
-multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
-                                  ValueType vt, string frag> {
-  let OtherPredicates = [LDSRequiresM0Init] in {
-    def : DSAtomicRetPat<inst, vt,
-                         !cast<PatFrag>(frag#"_local_m0_"#vt)>;
-    def : DSAtomicRetPat<noRetInst, vt,
-                         !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
-  }
-
-  let OtherPredicates = [NotLDSRequiresM0Init] in {
-    def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
-                         !cast<PatFrag>(frag#"_local_"#vt)>;
-    def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
-                         !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
-  }
-
-  let OtherPredicates = [HasGDS] in {
-    def : DSAtomicRetPat<inst, vt,
-                         !cast<PatFrag>(frag#"_region_m0_"#vt),
-                         /* complexity */ 0, /* gds */ 1>;
-    def : DSAtomicRetPat<noRetInst, vt,
-                         !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
-                         /* complexity */ 1, /* gds */ 1>;
-  }
-}
-
-
-
 let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
 // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
 class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
@@ -1212,8 +1271,8 @@ defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax
 
 
 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
 }
 
 let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
@@ -1265,7 +1324,7 @@ class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
 } // End SubtargetPredicate = HasLdsAtomicAddF64
 
 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
-defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
+defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_load_fadd">;
 } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
 
 let OtherPredicates = [HasGDS] in
@@ -1357,8 +1416,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
 // GFX12.
 //===----------------------------------------------------------------------===//
 
-multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
-  defvar ps = !cast<DS_Pseudo>(NAME);
+multiclass DS_Real_gfx12<bits<8> op,
+                         DS_Pseudo ps = !cast<DS_Pseudo>(NAME),
+                         string name = !tolower(NAME)> {
+
   let AssemblerPredicate = isGFX12Plus in {
     let DecoderNamespace = "GFX12" in
       def _gfx12 :
@@ -1369,14 +1430,20 @@ multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
   } // End AssemblerPredicate
 }
 
-defm DS_MIN_F32           : DS_Real_gfx12<0x012, "ds_min_num_f32">;
-defm DS_MAX_F32           : DS_Real_gfx12<0x013, "ds_max_num_f32">;
-defm DS_MIN_RTN_F32       : DS_Real_gfx12<0x032, "ds_min_num_rtn_f32">;
-defm DS_MAX_RTN_F32       : DS_Real_gfx12<0x033, "ds_max_num_rtn_f32">;
-defm DS_MIN_F64           : DS_Real_gfx12<0x052, "ds_min_num_f64">;
-defm DS_MAX_F64           : DS_Real_gfx12<0x053, "ds_max_num_f64">;
-defm DS_MIN_RTN_F64       : DS_Real_gfx12<0x072, "ds_min_num_rtn_f64">;
-defm DS_MAX_RTN_F64       : DS_Real_gfx12<0x073, "ds_max_num_rtn_f64">;
+// Helper for defs that only override the gfx12 name: forwards `op` and `name`
+// to DS_Real_gfx12 with the pseudo looked up as NAME#"_gfx9".
+multiclass DS_Real_gfx12_with_name<bits<8> op, string name> {
+  defm "" : DS_Real_gfx12<op, !cast<DS_Pseudo>(NAME#"_gfx9"), name>;
+}
+
+defm DS_MIN_F32           : DS_Real_gfx12_with_name<0x012, "ds_min_num_f32">;
+defm DS_MAX_F32           : DS_Real_gfx12_with_name<0x013, "ds_max_num_f32">;
+defm DS_MIN_RTN_F32       : DS_Real_gfx12_with_name<0x032, "ds_min_num_rtn_f32">;
+defm DS_MAX_RTN_F32       : DS_Real_gfx12_with_name<0x033, "ds_max_num_rtn_f32">;
+defm DS_MIN_F64           : DS_Real_gfx12_with_name<0x052, "ds_min_num_f64">;
+defm DS_MAX_F64           : DS_Real_gfx12_with_name<0x053, "ds_max_num_f64">;
+defm DS_MIN_RTN_F64       : DS_Real_gfx12_with_name<0x072, "ds_min_num_rtn_f64">;
+defm DS_MAX_RTN_F64       : DS_Real_gfx12_with_name<0x073, "ds_max_num_rtn_f64">;
 defm DS_COND_SUB_U32      : DS_Real_gfx12<0x098>;
 defm DS_SUB_CLAMP_U32     : DS_Real_gfx12<0x099>;
 defm DS_COND_SUB_RTN_U32  : DS_Real_gfx12<0x0a8>;
@@ -1392,8 +1459,8 @@ defm DS_LOAD_TR6_B96      : DS_Real_gfx12<0x0fb>;
 defm DS_LOAD_TR16_B128    : DS_Real_gfx12<0x0fc>;
 defm DS_LOAD_TR8_B64      : DS_Real_gfx12<0x0fd>;
 
-defm DS_BVH_STACK_RTN_B32             : DS_Real_gfx12<0x0e0,
-  "ds_bvh_stack_push4_pop1_rtn_b32">;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0, DS_BVH_STACK_RTN_B32,
+                                          "ds_bvh_stack_push4_pop1_rtn_b32">;
 defm DS_BVH_STACK_PUSH8_POP1_RTN_B32  : DS_Real_gfx12<0x0e1>;
 defm DS_BVH_STACK_PUSH8_POP2_RTN_B64  : DS_Real_gfx12<0x0e2>;
 
@@ -1421,8 +1488,8 @@ def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250
 // GFX11.
 //===----------------------------------------------------------------------===//
 
-multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
-  defvar ps = !cast<DS_Pseudo>(NAME);
+multiclass DS_Real_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9"),
+                                     string name = !tolower(NAME)> {
   let AssemblerPredicate = isGFX11Only in {
     let DecoderNamespace = "GFX11" in
       def _gfx11 :
@@ -1433,8 +1500,11 @@ multiclass DS_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
   } // End AssemblerPredicate
 }
 
-multiclass DS_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)>
-  : DS_Real_gfx11<op, name>, DS_Real_gfx12<op, name>;
+multiclass DS_Real_gfx11_gfx12<bits<8> op,
+                               string name = !tolower(NAME),
+                               DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")>
+  : DS_Real_gfx11<op, ps, name>, // NB: the bases take ps before name.
+    DS_Real_gfx12<op, ps, name>;
 
 defm DS_WRITE_B32           : DS_Real_gfx11_gfx12<0x00d, "ds_store_b32">;
 defm DS_WRITE2_B32          : DS_Real_gfx11_gfx12<0x00e, "ds_store_2addr_b32">;
@@ -1460,16 +1530,16 @@ defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx11_gfx12<0x06f, "ds_storexchg_2addr_str
 defm DS_READ_B64            : DS_Real_gfx11_gfx12<0x076, "ds_load_b64">;
 defm DS_READ2_B64           : DS_Real_gfx11_gfx12<0x077, "ds_load_2addr_b64">;
 defm DS_READ2ST64_B64       : DS_Real_gfx11_gfx12<0x078, "ds_load_2addr_stride64_b64">;
-defm DS_WRITE_B8_D16_HI     : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi">;
-defm DS_WRITE_B16_D16_HI    : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi">;
-defm DS_READ_U8_D16         : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16">;
-defm DS_READ_U8_D16_HI      : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi">;
-defm DS_READ_I8_D16         : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16">;
-defm DS_READ_I8_D16_HI      : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi">;
-defm DS_READ_U16_D16        : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16">;
-defm DS_READ_U16_D16_HI     : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi">;
-defm DS_WRITE_ADDTID_B32    : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32">;
-defm DS_READ_ADDTID_B32     : DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32">;
+defm DS_WRITE_B8_D16_HI     : DS_Real_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi", DS_WRITE_B8_D16_HI>;
+defm DS_WRITE_B16_D16_HI    : DS_Real_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi", DS_WRITE_B16_D16_HI>;
+defm DS_READ_U8_D16         : DS_Real_gfx11_gfx12<0x0a2, "ds_load_u8_d16", DS_READ_U8_D16>;
+defm DS_READ_U8_D16_HI      : DS_Real_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi", DS_READ_U8_D16_HI>;
+defm DS_READ_I8_D16         : DS_Real_gfx11_gfx12<0x0a4, "ds_load_i8_d16", DS_READ_I8_D16>;
+defm DS_READ_I8_D16_HI      : DS_Real_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi", DS_READ_I8_D16_HI>;
+defm DS_READ_U16_D16        : DS_Real_gfx11_gfx12<0x0a6, "ds_load_u16_d16", DS_READ_U16_D16>;
+defm DS_READ_U16_D16_HI     : DS_Real_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi", DS_READ_U16_D16_HI>;
+defm DS_WRITE_ADDTID_B32    : DS_Real_gfx11_gfx12<0x0b0, "ds_store_addtid_b32", DS_WRITE_ADDTID_B32>;
+defm DS_READ_ADDTID_B32     : DS_Real_gfx11_gfx12<0x0b1, "ds_load_addtid_b32", DS_READ_ADDTID_B32>;
 defm DS_WRITE_B96           : DS_Real_gfx11_gfx12<0x0de, "ds_store_b96">;
 defm DS_WRITE_B128          : DS_Real_gfx11_gfx12<0x0df, "ds_store_b128">;
 defm DS_READ_B96            : DS_Real_gfx11_gfx12<0x0fe, "ds_load_b96">;
@@ -1489,22 +1559,22 @@ defm DS_CMPSTORE_RTN_B64                 : DS_Real_gfx11_gfx12<0x070>;
 defm DS_CMPSTORE_RTN_F64                 : DS_Real_gfx11<0x071>;
 
 defm DS_ADD_RTN_F32                      : DS_Real_gfx11_gfx12<0x079>;
-defm DS_ADD_GS_REG_RTN                   : DS_Real_gfx11<0x07a>;
-defm DS_SUB_GS_REG_RTN                   : DS_Real_gfx11<0x07b>;
-defm DS_BVH_STACK_RTN_B32                : DS_Real_gfx11<0x0ad>;
+defm DS_ADD_GS_REG_RTN                   : DS_Real_gfx11<0x07a, DS_ADD_GS_REG_RTN>;
+defm DS_SUB_GS_REG_RTN                   : DS_Real_gfx11<0x07b, DS_SUB_GS_REG_RTN>;
+defm DS_BVH_STACK_RTN_B32                : DS_Real_gfx11<0x0ad, DS_BVH_STACK_RTN_B32>;
 
 //===----------------------------------------------------------------------===//
 // GFX10.
 //===----------------------------------------------------------------------===//
 
 let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
-  multiclass DS_Real_gfx10<bits<8> op>  {
+  multiclass DS_Real_gfx10<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME)>  {
     def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
-      !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>;
+      ps, SIEncodingFamily.GFX10>;
   }
 } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
 
-defm DS_ADD_RTN_F32      : DS_Real_gfx10<0x055>;
+defm DS_ADD_RTN_F32      : DS_Real_gfx10<0x055, DS_ADD_RTN_F32_gfx9>;
 defm DS_WRITE_B8_D16_HI  : DS_Real_gfx10<0x0a0>;
 defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
 defm DS_READ_U8_D16      : DS_Real_gfx10<0x0a2>;
@@ -1520,39 +1590,48 @@ defm DS_READ_ADDTID_B32  : DS_Real_gfx10<0x0b1>;
 // GFX10, GFX11, GFX12.
 //===----------------------------------------------------------------------===//
 
-multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op> :
-  DS_Real_gfx10<op>, DS_Real_gfx11<op>, DS_Real_gfx12<op>;
+multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx10<op, ps>,
+  DS_Real_gfx11<op, ps>,
+  DS_Real_gfx12<op, ps>;
 
-multiclass DS_Real_gfx10_gfx11<bits<8> op> :
-  DS_Real_gfx10<op>, DS_Real_gfx11<op>;
+multiclass DS_Real_gfx10_gfx11<bits<8> op, DS_Pseudo ps = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx10<op, ps>, DS_Real_gfx11<op, ps>;
 
 defm DS_ADD_F32          : DS_Real_gfx10_gfx11_gfx12<0x015>;
 defm DS_ADD_SRC2_F32     : DS_Real_gfx10<0x095>;
-defm DS_PERMUTE_B32      : DS_Real_gfx10_gfx11_gfx12<0x0b2>;
-defm DS_BPERMUTE_B32     : DS_Real_gfx10_gfx11_gfx12<0x0b3>;
+defm DS_PERMUTE_B32      : DS_Real_gfx10_gfx11_gfx12<0x0b2, DS_PERMUTE_B32>;
+defm DS_BPERMUTE_B32     : DS_Real_gfx10_gfx11_gfx12<0x0b3, DS_BPERMUTE_B32>;
 
 //===----------------------------------------------------------------------===//
 // GFX7, GFX10, GFX11, GFX12.
 //===----------------------------------------------------------------------===//
 
 let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
-  multiclass DS_Real_gfx7<bits<8> op> {
+  multiclass DS_Real_gfx7<bits<8> op, DS_Pseudo ps> {
     def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
-      !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
+      ps, SIEncodingFamily.SI>;
   }
 } // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
 
-multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
-  DS_Real_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op,
+           DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+           DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx7<op, ps_gfx6>,
+  DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>;
 
-multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> :
-  DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op,
+           DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+           DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>;
 
-multiclass DS_Real_gfx7_gfx10<bits<8> op> :
-  DS_Real_gfx7<op>, DS_Real_gfx10<op>;
+multiclass DS_Real_gfx7_gfx10<bits<8> op,
+           DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+           DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>;
 
 // FIXME-GFX7: Add tests when upstreaming this part.
-defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>;
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018, DS_GWS_SEMA_RELEASE_ALL, DS_GWS_SEMA_RELEASE_ALL>;
 defm DS_WRAP_RTN_B32         : DS_Real_gfx7_gfx10_gfx11<0x034>;
 defm DS_CONDXCHG32_RTN_B64   : DS_Real_gfx7_gfx10_gfx11_gfx12<0x07e>;
 defm DS_WRITE_B96            : DS_Real_gfx7_gfx10<0x0de>;
@@ -1565,20 +1644,27 @@ defm DS_READ_B128            : DS_Real_gfx7_gfx10<0x0ff>;
 //===----------------------------------------------------------------------===//
 
 let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
-  multiclass DS_Real_gfx6_gfx7<bits<8> op> {
+  multiclass DS_Real_gfx6_gfx7<bits<8> op, DS_Pseudo ps> {
     def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
-      !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
+      ps, SIEncodingFamily.SI>;
   }
 } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
 
-multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
-  DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op,
+           DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+           DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx6_gfx7<op, ps_gfx6>,
+  DS_Real_gfx10_gfx11_gfx12<op, ps_gfx9>;
 
-multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
-  DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op,
+           DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+           DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10_gfx11<op, ps_gfx9>;
 
-multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
-  DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
+multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op,
+                                   DS_Pseudo ps_gfx6 = !cast<DS_Pseudo>(NAME),
+                                   DS_Pseudo ps_gfx9 = !cast<DS_Pseudo>(NAME#"_gfx9")> :
+  DS_Real_gfx6_gfx7<op, ps_gfx6>, DS_Real_gfx10<op, ps_gfx9>;
 
 defm DS_ADD_U32             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>;
 defm DS_SUB_U32             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x001>;
@@ -1602,12 +1688,12 @@ defm DS_CMPST_F32           : DS_Real_gfx6_gfx7_gfx10<0x011>;
 
 defm DS_MIN_F32             : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
 defm DS_MAX_F32             : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
-defm DS_NOP                 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>;
-defm DS_GWS_INIT            : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>;
-defm DS_GWS_SEMA_V          : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>;
-defm DS_GWS_SEMA_BR         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
-defm DS_GWS_SEMA_P          : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>;
-defm DS_GWS_BARRIER         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
+defm DS_NOP                 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014, DS_NOP, DS_NOP>;
+defm DS_GWS_INIT            : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019, DS_GWS_INIT, DS_GWS_INIT>;
+defm DS_GWS_SEMA_V          : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a, DS_GWS_SEMA_V, DS_GWS_SEMA_V>;
+defm DS_GWS_SEMA_BR         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b, DS_GWS_SEMA_BR, DS_GWS_SEMA_BR>;
+defm DS_GWS_SEMA_P          : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c, DS_GWS_SEMA_P, DS_GWS_SEMA_P>;
+defm DS_GWS_BARRIER         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d, DS_GWS_BARRIER, DS_GWS_BARRIER>;
 
 defm DS_WRITE_B8            : DS_Real_gfx6_gfx7_gfx10<0x01e>;
 defm DS_WRITE_B16           : DS_Real_gfx6_gfx7_gfx10<0x01f>;
@@ -1634,7 +1720,7 @@ defm DS_CMPST_RTN_F32       : DS_Real_gfx6_gfx7_gfx10<0x031>;
 
 defm DS_MIN_RTN_F32         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>;
 defm DS_MAX_RTN_F32         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>;
-defm DS_SWIZZLE_B32         : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035>;
+defm DS_SWIZZLE_B32         : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035, DS_SWIZZLE_B32, DS_SWIZZLE_B32>;
 
 defm DS_READ_B32            : DS_Real_gfx6_gfx7_gfx10<0x036>;
 defm DS_READ2_B32           : DS_Real_gfx6_gfx7_gfx10<0x037>;
@@ -1644,9 +1730,9 @@ defm DS_READ_U8             : DS_Real_gfx6_gfx7_gfx10<0x03a>;
 defm DS_READ_I16            : DS_Real_gfx6_gfx7_gfx10<0x03b>;
 defm DS_READ_U16            : DS_Real_gfx6_gfx7_gfx10<0x03c>;
 
-defm DS_CONSUME             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d>;
-defm DS_APPEND              : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e>;
-defm DS_ORDERED_COUNT       : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>;
+defm DS_CONSUME             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d, DS_CONSUME, DS_CONSUME>;
+defm DS_APPEND              : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e, DS_APPEND, DS_APPEND>;
+defm DS_ORDERED_COUNT       : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f, DS_ORDERED_COUNT, DS_ORDERED_COUNT>;
 defm DS_ADD_U64             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x040>;
 defm DS_SUB_U64             : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x041>;
 defm DS_RSUB_U64            : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x042>;
@@ -1695,42 +1781,42 @@ defm DS_MAX_RTN_F64         : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>;
 defm DS_READ_B64            : DS_Real_gfx6_gfx7_gfx10<0x076>;
 defm DS_READ2_B64           : DS_Real_gfx6_gfx7_gfx10<0x077>;
 defm DS_READ2ST64_B64       : DS_Real_gfx6_gfx7_gfx10<0x078>;
-defm DS_ADD_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x080>;
-defm DS_SUB_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x081>;
-defm DS_RSUB_SRC2_U32       : DS_Real_gfx6_gfx7_gfx10<0x082>;
-defm DS_INC_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x083>;
-defm DS_DEC_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x084>;
-defm DS_MIN_SRC2_I32        : DS_Real_gfx6_gfx7_gfx10<0x085>;
-defm DS_MAX_SRC2_I32        : DS_Real_gfx6_gfx7_gfx10<0x086>;
-defm DS_MIN_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x087>;
-defm DS_MAX_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x088>;
-defm DS_AND_SRC2_B32        : DS_Real_gfx6_gfx7_gfx10<0x089>;
-defm DS_OR_SRC2_B32         : DS_Real_gfx6_gfx7_gfx10<0x08a>;
-defm DS_XOR_SRC2_B32        : DS_Real_gfx6_gfx7_gfx10<0x08b>;
-defm DS_WRITE_SRC2_B32      : DS_Real_gfx6_gfx7_gfx10<0x08d>;
-defm DS_MIN_SRC2_F32        : DS_Real_gfx6_gfx7_gfx10<0x092>;
-defm DS_MAX_SRC2_F32        : DS_Real_gfx6_gfx7_gfx10<0x093>;
-defm DS_ADD_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c0>;
-defm DS_SUB_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c1>;
-defm DS_RSUB_SRC2_U64       : DS_Real_gfx6_gfx7_gfx10<0x0c2>;
-defm DS_INC_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c3>;
-defm DS_DEC_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c4>;
-defm DS_MIN_SRC2_I64        : DS_Real_gfx6_gfx7_gfx10<0x0c5>;
-defm DS_MAX_SRC2_I64        : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
-defm DS_MIN_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
-defm DS_MAX_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
-defm DS_AND_SRC2_B64        : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
-defm DS_OR_SRC2_B64         : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
-defm DS_XOR_SRC2_B64        : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
-defm DS_WRITE_SRC2_B64      : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
-defm DS_MIN_SRC2_F64        : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
-defm DS_MAX_SRC2_F64        : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
+defm DS_ADD_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x080, DS_ADD_SRC2_U32, DS_ADD_SRC2_U32>;
+defm DS_SUB_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x081, DS_SUB_SRC2_U32, DS_SUB_SRC2_U32>;
+defm DS_RSUB_SRC2_U32       : DS_Real_gfx6_gfx7_gfx10<0x082, DS_RSUB_SRC2_U32, DS_RSUB_SRC2_U32>;
+defm DS_INC_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x083, DS_INC_SRC2_U32, DS_INC_SRC2_U32>;
+defm DS_DEC_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x084, DS_DEC_SRC2_U32, DS_DEC_SRC2_U32>;
+defm DS_MIN_SRC2_I32        : DS_Real_gfx6_gfx7_gfx10<0x085, DS_MIN_SRC2_I32, DS_MIN_SRC2_I32>;
+defm DS_MAX_SRC2_I32        : DS_Real_gfx6_gfx7_gfx10<0x086, DS_MAX_SRC2_I32, DS_MAX_SRC2_I32>;
+defm DS_MIN_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x087, DS_MIN_SRC2_U32, DS_MIN_SRC2_U32>;
+defm DS_MAX_SRC2_U32        : DS_Real_gfx6_gfx7_gfx10<0x088, DS_MAX_SRC2_U32, DS_MAX_SRC2_U32>;
+defm DS_AND_SRC2_B32        : DS_Real_gfx6_gfx7_gfx10<0x089, DS_AND_SRC2_B32, DS_AND_SRC2_B32>;
+defm DS_OR_SRC2_B32         : DS_Real_gfx6_gfx7_gfx10<0x08a, DS_OR_SRC2_B32, DS_OR_SRC2_B32>;
+defm DS_XOR_SRC2_B32        : DS_Real_gfx6_gfx7_gfx10<0x08b, DS_XOR_SRC2_B32, DS_XOR_SRC2_B32>;
+defm DS_WRITE_SRC2_B32      : DS_Real_gfx6_gfx7_gfx10<0x08d, DS_WRITE_SRC2_B32, DS_WRITE_SRC2_B32>;
+defm DS_MIN_SRC2_F32        : DS_Real_gfx6_gfx7_gfx10<0x092, DS_MIN_SRC2_F32, DS_MIN_SRC2_F32>;
+defm DS_MAX_SRC2_F32        : DS_Real_gfx6_gfx7_gfx10<0x093, DS_MAX_SRC2_F32, DS_MAX_SRC2_F32>;
+defm DS_ADD_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c0, DS_ADD_SRC2_U64, DS_ADD_SRC2_U64>;
+defm DS_SUB_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c1, DS_SUB_SRC2_U64, DS_SUB_SRC2_U64>;
+defm DS_RSUB_SRC2_U64       : DS_Real_gfx6_gfx7_gfx10<0x0c2, DS_RSUB_SRC2_U64, DS_RSUB_SRC2_U64>;
+defm DS_INC_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c3, DS_INC_SRC2_U64, DS_INC_SRC2_U64>;
+defm DS_DEC_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c4, DS_DEC_SRC2_U64, DS_DEC_SRC2_U64>;
+defm DS_MIN_SRC2_I64        : DS_Real_gfx6_gfx7_gfx10<0x0c5, DS_MIN_SRC2_I64, DS_MIN_SRC2_I64>;
+defm DS_MAX_SRC2_I64        : DS_Real_gfx6_gfx7_gfx10<0x0c6, DS_MAX_SRC2_I64, DS_MAX_SRC2_I64>;
+defm DS_MIN_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c7, DS_MIN_SRC2_U64, DS_MIN_SRC2_U64>;
+defm DS_MAX_SRC2_U64        : DS_Real_gfx6_gfx7_gfx10<0x0c8, DS_MAX_SRC2_U64, DS_MAX_SRC2_U64>;
+defm DS_AND_SRC2_B64        : DS_Real_gfx6_gfx7_gfx10<0x0c9, DS_AND_SRC2_B64, DS_AND_SRC2_B64>;
+defm DS_OR_SRC2_B64         : DS_Real_gfx6_gfx7_gfx10<0x0ca, DS_OR_SRC2_B64, DS_OR_SRC2_B64>;
+defm DS_XOR_SRC2_B64        : DS_Real_gfx6_gfx7_gfx10<0x0cb, DS_XOR_SRC2_B64, DS_XOR_SRC2_B64>;
+defm DS_WRITE_SRC2_B64      : DS_Real_gfx6_gfx7_gfx10<0x0cd, DS_WRITE_SRC2_B64, DS_WRITE_SRC2_B64>;
+defm DS_MIN_SRC2_F64        : DS_Real_gfx6_gfx7_gfx10<0x0d2, DS_MIN_SRC2_F64, DS_MIN_SRC2_F64>;
+defm DS_MAX_SRC2_F64        : DS_Real_gfx6_gfx7_gfx10<0x0d3, DS_MAX_SRC2_F64, DS_MAX_SRC2_F64>;
 
 //===----------------------------------------------------------------------===//
 // GFX8, GFX9 (VI).
 //===----------------------------------------------------------------------===//
 
-class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
+class DS_Real_Base_vi <bits<8> op, DS_Pseudo ps> :
   DS_Real <ps>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
   let AssemblerPredicate = isGFX8GFX9;
@@ -1749,181 +1835,210 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
   let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
 }
 
-def DS_ADD_U32_vi         : DS_Real_vi<0x0,  DS_ADD_U32>;
-def DS_SUB_U32_vi         : DS_Real_vi<0x1,  DS_SUB_U32>;
-def DS_RSUB_U32_vi        : DS_Real_vi<0x2,  DS_RSUB_U32>;
-def DS_INC_U32_vi         : DS_Real_vi<0x3,  DS_INC_U32>;
-def DS_DEC_U32_vi         : DS_Real_vi<0x4,  DS_DEC_U32>;
-def DS_MIN_I32_vi         : DS_Real_vi<0x5,  DS_MIN_I32>;
-def DS_MAX_I32_vi         : DS_Real_vi<0x6,  DS_MAX_I32>;
-def DS_MIN_U32_vi         : DS_Real_vi<0x7,  DS_MIN_U32>;
-def DS_MAX_U32_vi         : DS_Real_vi<0x8,  DS_MAX_U32>;
-def DS_AND_B32_vi         : DS_Real_vi<0x9,  DS_AND_B32>;
-def DS_OR_B32_vi          : DS_Real_vi<0xa,  DS_OR_B32>;
-def DS_XOR_B32_vi         : DS_Real_vi<0xb,  DS_XOR_B32>;
-def DS_MSKOR_B32_vi       : DS_Real_vi<0xc,  DS_MSKOR_B32>;
-def DS_WRITE_B32_vi       : DS_Real_vi<0xd,  DS_WRITE_B32>;
-def DS_WRITE2_B32_vi      : DS_Real_vi<0xe,  DS_WRITE2_B32>;
-def DS_WRITE2ST64_B32_vi  : DS_Real_vi<0xf,  DS_WRITE2ST64_B32>;
-def DS_CMPST_B32_vi       : DS_Real_vi<0x10, DS_CMPST_B32>;
-def DS_CMPST_F32_vi       : DS_Real_vi<0x11, DS_CMPST_F32>;
-def DS_MIN_F32_vi         : DS_Real_vi<0x12, DS_MIN_F32>;
-def DS_MAX_F32_vi         : DS_Real_vi<0x13, DS_MAX_F32>;
-def DS_NOP_vi             : DS_Real_vi<0x14, DS_NOP>;
-def DS_ADD_F32_vi         : DS_Real_vi<0x15, DS_ADD_F32>;
-def DS_GWS_INIT_vi        : DS_Real_vi<0x99, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_vi      : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_vi     : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_vi      : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_vi     : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
-def DS_WRITE_ADDTID_B32_vi : DS_Real_vi<0x1d, DS_WRITE_ADDTID_B32>;
-def DS_WRITE_B8_vi        : DS_Real_vi<0x1e, DS_WRITE_B8>;
-def DS_WRITE_B16_vi       : DS_Real_vi<0x1f, DS_WRITE_B16>;
-def DS_ADD_RTN_U32_vi     : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
-def DS_SUB_RTN_U32_vi     : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
-def DS_RSUB_RTN_U32_vi    : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
-def DS_INC_RTN_U32_vi     : DS_Real_vi<0x23, DS_INC_RTN_U32>;
-def DS_DEC_RTN_U32_vi     : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
-def DS_MIN_RTN_I32_vi     : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
-def DS_MAX_RTN_I32_vi     : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
-def DS_MIN_RTN_U32_vi     : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
-def DS_MAX_RTN_U32_vi     : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
-def DS_AND_RTN_B32_vi     : DS_Real_vi<0x29, DS_AND_RTN_B32>;
-def DS_OR_RTN_B32_vi      : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
-def DS_XOR_RTN_B32_vi     : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
-def DS_MSKOR_RTN_B32_vi   : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
-def DS_WRXCHG_RTN_B32_vi  : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
-def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
-def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
-def DS_CMPST_RTN_B32_vi   : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
-def DS_CMPST_RTN_F32_vi   : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
-def DS_MIN_RTN_F32_vi     : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
-def DS_MAX_RTN_F32_vi     : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
-def DS_WRAP_RTN_B32_vi    : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
-def DS_ADD_RTN_F32_vi     : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
-def DS_READ_B32_vi        : DS_Real_vi<0x36, DS_READ_B32>;
-def DS_READ2_B32_vi       : DS_Real_vi<0x37, DS_READ2_B32>;
-def DS_READ2ST64_B32_vi   : DS_Real_vi<0x38, DS_READ2ST64_B32>;
-def DS_READ_I8_vi         : DS_Real_vi<0x39, DS_READ_I8>;
-def DS_READ_U8_vi         : DS_Real_vi<0x3a, DS_READ_U8>;
-def DS_READ_I16_vi        : DS_Real_vi<0x3b, DS_READ_I16>;
-def DS_READ_U16_vi        : DS_Real_vi<0x3c, DS_READ_U16>;
-def DS_READ_ADDTID_B32_vi : DS_Real_vi<0xb6, DS_READ_ADDTID_B32>;
-def DS_CONSUME_vi         : DS_Real_vi<0xbd, DS_CONSUME>;
-def DS_APPEND_vi          : DS_Real_vi<0xbe, DS_APPEND>;
-def DS_ORDERED_COUNT_vi   : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
-def DS_SWIZZLE_B32_vi     : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
-def DS_PERMUTE_B32_vi     : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
-def DS_BPERMUTE_B32_vi    : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
-
-def DS_ADD_U64_vi         : DS_Real_vi<0x40, DS_ADD_U64>;
-def DS_SUB_U64_vi         : DS_Real_vi<0x41, DS_SUB_U64>;
-def DS_RSUB_U64_vi        : DS_Real_vi<0x42, DS_RSUB_U64>;
-def DS_INC_U64_vi         : DS_Real_vi<0x43, DS_INC_U64>;
-def DS_DEC_U64_vi         : DS_Real_vi<0x44, DS_DEC_U64>;
-def DS_MIN_I64_vi         : DS_Real_vi<0x45, DS_MIN_I64>;
-def DS_MAX_I64_vi         : DS_Real_vi<0x46, DS_MAX_I64>;
-def DS_MIN_U64_vi         : DS_Real_vi<0x47, DS_MIN_U64>;
-def DS_MAX_U64_vi         : DS_Real_vi<0x48, DS_MAX_U64>;
-def DS_AND_B64_vi         : DS_Real_vi<0x49, DS_AND_B64>;
-def DS_OR_B64_vi          : DS_Real_vi<0x4a, DS_OR_B64>;
-def DS_XOR_B64_vi         : DS_Real_vi<0x4b, DS_XOR_B64>;
-def DS_MSKOR_B64_vi       : DS_Real_vi<0x4c, DS_MSKOR_B64>;
-def DS_WRITE_B64_vi       : DS_Real_vi<0x4d, DS_WRITE_B64>;
-def DS_WRITE2_B64_vi      : DS_Real_vi<0x4E, DS_WRITE2_B64>;
-def DS_WRITE2ST64_B64_vi  : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
-def DS_CMPST_B64_vi       : DS_Real_vi<0x50, DS_CMPST_B64>;
-def DS_CMPST_F64_vi       : DS_Real_vi<0x51, DS_CMPST_F64>;
-def DS_MIN_F64_vi         : DS_Real_vi<0x52, DS_MIN_F64>;
-def DS_MAX_F64_vi         : DS_Real_vi<0x53, DS_MAX_F64>;
-
-def DS_WRITE_B8_D16_HI_vi  : DS_Real_vi<0x54, DS_WRITE_B8_D16_HI>;
-def DS_WRITE_B16_D16_HI_vi : DS_Real_vi<0x55, DS_WRITE_B16_D16_HI>;
-
-def DS_READ_U8_D16_vi     : DS_Real_vi<0x56, DS_READ_U8_D16>;
-def DS_READ_U8_D16_HI_vi  : DS_Real_vi<0x57, DS_READ_U8_D16_HI>;
-def DS_READ_I8_D16_vi     : DS_Real_vi<0x58, DS_READ_I8_D16>;
-def DS_READ_I8_D16_HI_vi  : DS_Real_vi<0x59, DS_READ_I8_D16_HI>;
-def DS_READ_U16_D16_vi    : DS_Real_vi<0x5a, DS_READ_U16_D16>;
-def DS_READ_U16_D16_HI_vi : DS_Real_vi<0x5b, DS_READ_U16_D16_HI>;
-
-def DS_ADD_RTN_U64_vi     : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
-def DS_SUB_RTN_U64_vi     : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
-def DS_RSUB_RTN_U64_vi    : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
-def DS_INC_RTN_U64_vi     : DS_Real_vi<0x63, DS_INC_RTN_U64>;
-def DS_DEC_RTN_U64_vi     : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
-def DS_MIN_RTN_I64_vi     : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
-def DS_MAX_RTN_I64_vi     : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
-def DS_MIN_RTN_U64_vi     : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
-def DS_MAX_RTN_U64_vi     : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
-def DS_AND_RTN_B64_vi     : DS_Real_vi<0x69, DS_AND_RTN_B64>;
-def DS_OR_RTN_B64_vi      : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
-def DS_XOR_RTN_B64_vi     : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
-def DS_MSKOR_RTN_B64_vi   : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
-def DS_WRXCHG_RTN_B64_vi  : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
-def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
-def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
-def DS_CONDXCHG32_RTN_B64_vi   : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
-def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
-def DS_CMPST_RTN_B64_vi   : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
-def DS_CMPST_RTN_F64_vi   : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
-def DS_MIN_RTN_F64_vi     : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
-def DS_MAX_RTN_F64_vi     : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
-
-def DS_READ_B64_vi        : DS_Real_vi<0x76, DS_READ_B64>;
-def DS_READ2_B64_vi       : DS_Real_vi<0x77, DS_READ2_B64>;
-def DS_READ2ST64_B64_vi   : DS_Real_vi<0x78, DS_READ2ST64_B64>;
-
-def DS_ADD_SRC2_U32_vi    : DS_Real_vi<0x80, DS_ADD_SRC2_U32>;
-def DS_SUB_SRC2_U32_vi    : DS_Real_vi<0x81, DS_SUB_SRC2_U32>;
-def DS_RSUB_SRC2_U32_vi   : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>;
-def DS_INC_SRC2_U32_vi    : DS_Real_vi<0x83, DS_INC_SRC2_U32>;
-def DS_DEC_SRC2_U32_vi    : DS_Real_vi<0x84, DS_DEC_SRC2_U32>;
-def DS_MIN_SRC2_I32_vi    : DS_Real_vi<0x85, DS_MIN_SRC2_I32>;
-def DS_MAX_SRC2_I32_vi    : DS_Real_vi<0x86, DS_MAX_SRC2_I32>;
-def DS_MIN_SRC2_U32_vi    : DS_Real_vi<0x87, DS_MIN_SRC2_U32>;
-def DS_MAX_SRC2_U32_vi    : DS_Real_vi<0x88, DS_MAX_SRC2_U32>;
-def DS_AND_SRC2_B32_vi    : DS_Real_vi<0x89, DS_AND_SRC2_B32>;
-def DS_OR_SRC2_B32_vi     : DS_Real_vi<0x8a, DS_OR_SRC2_B32>;
-def DS_XOR_SRC2_B32_vi    : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
-def DS_WRITE_SRC2_B32_vi  : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
-def DS_MIN_SRC2_F32_vi    : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
-def DS_MAX_SRC2_F32_vi    : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
-def DS_ADD_SRC2_F32_vi    : DS_Real_vi<0x95, DS_ADD_SRC2_F32>;
-def DS_ADD_SRC2_U64_vi    : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
-def DS_SUB_SRC2_U64_vi    : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
-def DS_RSUB_SRC2_U64_vi   : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
-def DS_INC_SRC2_U64_vi    : DS_Real_vi<0xc3, DS_INC_SRC2_U64>;
-def DS_DEC_SRC2_U64_vi    : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>;
-def DS_MIN_SRC2_I64_vi    : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>;
-def DS_MAX_SRC2_I64_vi    : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>;
-def DS_MIN_SRC2_U64_vi    : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>;
-def DS_MAX_SRC2_U64_vi    : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>;
-def DS_AND_SRC2_B64_vi    : DS_Real_vi<0xc9, DS_AND_SRC2_B64>;
-def DS_OR_SRC2_B64_vi     : DS_Real_vi<0xca, DS_OR_SRC2_B64>;
-def DS_XOR_SRC2_B64_vi    : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
-def DS_WRITE_SRC2_B64_vi  : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
-def DS_MIN_SRC2_F64_vi    : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
-def DS_MAX_SRC2_F64_vi    : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
-def DS_WRITE_B96_vi       : DS_Real_vi<0xde, DS_WRITE_B96>;
-def DS_WRITE_B128_vi      : DS_Real_vi<0xdf, DS_WRITE_B128>;
-def DS_READ_B96_vi        : DS_Real_vi<0xfe, DS_READ_B96>;
-def DS_READ_B128_vi       : DS_Real_vi<0xff, DS_READ_B128>;
+
+multiclass DS_Real_vi <bits<8> op, DS_Pseudo base_pseudo, bit need_gfx9_suffix = true> {
+  def "" : DS_Real_Base_vi<op, base_pseudo>;
+
+  if need_gfx9_suffix then {
+    def _gfx9 : DS_Real_Base_vi<op, !cast<DS_Pseudo>(!cast<string>(base_pseudo)#"_gfx9")> {
+      let DecoderNamespace = "GFX9";
+    }
+  }
+
+  // Also emit an _agpr real when a matching <pseudo>_agpr record exists, i.e.
+  // for instructions available in all-AGPR or all-VGPR data operand forms.
+  // This should be used for all DS instructions with 2 data operands.
+  defvar agpr_suffixed_name = !cast<string>(base_pseudo)#"_agpr";
+
+  if !exists<DS_Pseudo>(agpr_suffixed_name) then {
+    def _agpr : DS_Real_Base_vi<op, !cast<DS_Pseudo>(agpr_suffixed_name)> {
+      let DecoderNamespace = "GFX9";
+      let AssemblerPredicate = isGFX90APlus;
+    }
+  }
+}
+
+// Instructions whose use of m0 is the same on gfx8 and gfx9 (or which did
+// not exist on gfx8), so no separate _gfx9 real is emitted for them.
+multiclass DS_Real_m0_vi<bits<8> op, DS_Pseudo ps> : DS_Real_vi<op, ps, false>;
+
+defm DS_ADD_U32_vi        : DS_Real_vi<0x0,  DS_ADD_U32>;
+defm DS_SUB_U32_vi        : DS_Real_vi<0x1,  DS_SUB_U32>;
+defm DS_RSUB_U32_vi       : DS_Real_vi<0x2,  DS_RSUB_U32>;
+defm DS_INC_U32_vi        : DS_Real_vi<0x3,  DS_INC_U32>;
+defm DS_DEC_U32_vi        : DS_Real_vi<0x4,  DS_DEC_U32>;
+defm DS_MIN_I32_vi        : DS_Real_vi<0x5,  DS_MIN_I32>;
+defm DS_MAX_I32_vi        : DS_Real_vi<0x6,  DS_MAX_I32>;
+defm DS_MIN_U32_vi        : DS_Real_vi<0x7,  DS_MIN_U32>;
+defm DS_MAX_U32_vi        : DS_Real_vi<0x8,  DS_MAX_U32>;
+defm DS_AND_B32_vi        : DS_Real_vi<0x9,  DS_AND_B32>;
+defm DS_OR_B32_vi         : DS_Real_vi<0xa,  DS_OR_B32>;
+defm DS_XOR_B32_vi        : DS_Real_vi<0xb,  DS_XOR_B32>;
+defm DS_MSKOR_B32_vi      : DS_Real_vi<0xc,  DS_MSKOR_B32>;
+defm DS_WRITE_B32_vi      : DS_Real_vi<0xd,  DS_WRITE_B32>;
+defm DS_WRITE2_B32_vi     : DS_Real_vi<0xe,  DS_WRITE2_B32>;
+defm DS_WRITE2ST64_B32_vi : DS_Real_vi<0xf,  DS_WRITE2ST64_B32>;
+
+defm DS_CMPST_B32_vi      : DS_Real_vi<0x10, DS_CMPST_B32>;
+defm DS_CMPST_F32_vi      : DS_Real_vi<0x11, DS_CMPST_F32>;
+defm DS_MIN_F32_vi        : DS_Real_vi<0x12, DS_MIN_F32>;
+defm DS_MAX_F32_vi        : DS_Real_vi<0x13, DS_MAX_F32>;
+defm DS_NOP_vi            : DS_Real_m0_vi<0x14, DS_NOP>;
+defm DS_ADD_F32_vi        : DS_Real_vi<0x15, DS_ADD_F32>;
+defm DS_GWS_INIT_vi       : DS_Real_m0_vi<0x99, DS_GWS_INIT>;
+defm DS_GWS_SEMA_V_vi     : DS_Real_m0_vi<0x9a, DS_GWS_SEMA_V>;
+defm DS_GWS_SEMA_BR_vi    : DS_Real_m0_vi<0x9b, DS_GWS_SEMA_BR>;
+defm DS_GWS_SEMA_P_vi     : DS_Real_m0_vi<0x9c, DS_GWS_SEMA_P>;
+defm DS_GWS_BARRIER_vi    : DS_Real_m0_vi<0x9d, DS_GWS_BARRIER>;
+defm DS_WRITE_ADDTID_B32_vi: DS_Real_m0_vi<0x1d, DS_WRITE_ADDTID_B32>;
+defm DS_WRITE_B8_vi       : DS_Real_vi<0x1e, DS_WRITE_B8>;
+defm DS_WRITE_B16_vi      : DS_Real_vi<0x1f, DS_WRITE_B16>;
+defm DS_ADD_RTN_U32_vi    : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
+defm DS_SUB_RTN_U32_vi    : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
+defm DS_RSUB_RTN_U32_vi   : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
+defm DS_INC_RTN_U32_vi    : DS_Real_vi<0x23, DS_INC_RTN_U32>;
+defm DS_DEC_RTN_U32_vi    : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
+defm DS_MIN_RTN_I32_vi    : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
+defm DS_MAX_RTN_I32_vi    : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
+defm DS_MIN_RTN_U32_vi    : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
+defm DS_MAX_RTN_U32_vi    : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
+defm DS_AND_RTN_B32_vi    : DS_Real_vi<0x29, DS_AND_RTN_B32>;
+defm DS_OR_RTN_B32_vi     : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
+defm DS_XOR_RTN_B32_vi    : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
+defm DS_MSKOR_RTN_B32_vi  : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
+defm DS_WRXCHG_RTN_B32_vi : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
+defm DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
+defm DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+defm DS_CMPST_RTN_B32_vi  : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
+defm DS_CMPST_RTN_F32_vi  : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
+defm DS_MIN_RTN_F32_vi    : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
+defm DS_MAX_RTN_F32_vi    : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
+defm DS_WRAP_RTN_B32_vi   : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
+defm DS_ADD_RTN_F32_vi    : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
+defm DS_READ_B32_vi       : DS_Real_vi<0x36, DS_READ_B32>;
+defm DS_READ2_B32_vi      : DS_Real_vi<0x37, DS_READ2_B32>;
+defm DS_READ2ST64_B32_vi  : DS_Real_vi<0x38, DS_READ2ST64_B32>;
+defm DS_READ_I8_vi        : DS_Real_vi<0x39, DS_READ_I8>;
+defm DS_READ_U8_vi        : DS_Real_vi<0x3a, DS_READ_U8>;
+defm DS_READ_I16_vi       : DS_Real_vi<0x3b, DS_READ_I16>;
+defm DS_READ_U16_vi       : DS_Real_vi<0x3c, DS_READ_U16>;
+defm DS_READ_ADDTID_B32_vi : DS_Real_m0_vi<0xb6, DS_READ_ADDTID_B32>;
+defm DS_CONSUME_vi         : DS_Real_m0_vi<0xbd, DS_CONSUME>;
+defm DS_APPEND_vi          : DS_Real_m0_vi<0xbe, DS_APPEND>;
+defm DS_ORDERED_COUNT_vi   : DS_Real_m0_vi<0xbf, DS_ORDERED_COUNT>;
+defm DS_SWIZZLE_B32_vi     : DS_Real_m0_vi<0x3d, DS_SWIZZLE_B32>;
+defm DS_PERMUTE_B32_vi    : DS_Real_m0_vi<0x3e, DS_PERMUTE_B32>;
+defm DS_BPERMUTE_B32_vi   : DS_Real_m0_vi<0x3f, DS_BPERMUTE_B32>;
+
+defm DS_ADD_U64_vi        : DS_Real_vi<0x40, DS_ADD_U64>;
+defm DS_SUB_U64_vi        : DS_Real_vi<0x41, DS_SUB_U64>;
+defm DS_RSUB_U64_vi       : DS_Real_vi<0x42, DS_RSUB_U64>;
+defm DS_INC_U64_vi        : DS_Real_vi<0x43, DS_INC_U64>;
+defm DS_DEC_U64_vi        : DS_Real_vi<0x44, DS_DEC_U64>;
+defm DS_MIN_I64_vi        : DS_Real_vi<0x45, DS_MIN_I64>;
+defm DS_MAX_I64_vi        : DS_Real_vi<0x46, DS_MAX_I64>;
+defm DS_MIN_U64_vi        : DS_Real_vi<0x47, DS_MIN_U64>;
+defm DS_MAX_U64_vi        : DS_Real_vi<0x48, DS_MAX_U64>;
+defm DS_AND_B64_vi        : DS_Real_vi<0x49, DS_AND_B64>;
+defm DS_OR_B64_vi         : DS_Real_vi<0x4a, DS_OR_B64>;
+defm DS_XOR_B64_vi        : DS_Real_vi<0x4b, DS_XOR_B64>;
+defm DS_MSKOR_B64_vi      : DS_Real_vi<0x4c, DS_MSKOR_B64>;
+defm DS_WRITE_B64_vi      : DS_Real_vi<0x4d, DS_WRITE_B64>;
+defm DS_WRITE2_B64_vi     : DS_Real_vi<0x4E, DS_WRITE2_B64>;
+defm DS_WRITE2ST64_B64_vi : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
+
+defm DS_CMPST_B64_vi      : DS_Real_vi<0x50, DS_CMPST_B64>;
+defm DS_CMPST_F64_vi      : DS_Real_vi<0x51, DS_CMPST_F64>;
+defm DS_MIN_F64_vi        : DS_Real_vi<0x52, DS_MIN_F64>;
+defm DS_MAX_F64_vi        : DS_Real_vi<0x53, DS_MAX_F64>;
+
+defm DS_WRITE_B8_D16_HI_vi : DS_Real_m0_vi<0x54, DS_WRITE_B8_D16_HI>;
+defm DS_WRITE_B16_D16_HI_vi: DS_Real_m0_vi<0x55, DS_WRITE_B16_D16_HI>;
+
+defm DS_READ_U8_D16_vi    : DS_Real_m0_vi<0x56, DS_READ_U8_D16>;
+defm DS_READ_U8_D16_HI_vi : DS_Real_m0_vi<0x57, DS_READ_U8_D16_HI>;
+defm DS_READ_I8_D16_vi    : DS_Real_m0_vi<0x58, DS_READ_I8_D16>;
+defm DS_READ_I8_D16_HI_vi : DS_Real_m0_vi<0x59, DS_READ_I8_D16_HI>;
+defm DS_READ_U16_D16_vi   : DS_Real_m0_vi<0x5a, DS_READ_U16_D16>;
+defm DS_READ_U16_D16_HI_vi: DS_Real_m0_vi<0x5b, DS_READ_U16_D16_HI>;
+
+defm DS_ADD_RTN_U64_vi    : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
+defm DS_SUB_RTN_U64_vi    : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
+defm DS_RSUB_RTN_U64_vi   : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
+defm DS_INC_RTN_U64_vi    : DS_Real_vi<0x63, DS_INC_RTN_U64>;
+defm DS_DEC_RTN_U64_vi    : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
+defm DS_MIN_RTN_I64_vi    : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
+defm DS_MAX_RTN_I64_vi    : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
+defm DS_MIN_RTN_U64_vi    : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
+defm DS_MAX_RTN_U64_vi    : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
+defm DS_AND_RTN_B64_vi    : DS_Real_vi<0x69, DS_AND_RTN_B64>;
+defm DS_OR_RTN_B64_vi     : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
+defm DS_XOR_RTN_B64_vi    : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
+defm DS_MSKOR_RTN_B64_vi  : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
+defm DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
+defm DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
+defm DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+defm DS_CONDXCHG32_RTN_B64_vi  : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
+defm DS_GWS_SEMA_RELEASE_ALL_vi: DS_Real_m0_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
+defm DS_CMPST_RTN_B64_vi  : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
+defm DS_CMPST_RTN_F64_vi  : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
+defm DS_MIN_RTN_F64_vi    : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
+defm DS_MAX_RTN_F64_vi    : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
+
+defm DS_READ_B64_vi       : DS_Real_vi<0x76, DS_READ_B64>;
+defm DS_READ2_B64_vi      : DS_Real_vi<0x77, DS_READ2_B64>;
+defm DS_READ2ST64_B64_vi  : DS_Real_vi<0x78, DS_READ2ST64_B64>;
+
+defm DS_ADD_SRC2_U32_vi   : DS_Real_m0_vi<0x80, DS_ADD_SRC2_U32>;
+defm DS_SUB_SRC2_U32_vi   : DS_Real_m0_vi<0x81, DS_SUB_SRC2_U32>;
+defm DS_RSUB_SRC2_U32_vi  : DS_Real_m0_vi<0x82, DS_RSUB_SRC2_U32>;
+defm DS_INC_SRC2_U32_vi   : DS_Real_m0_vi<0x83, DS_INC_SRC2_U32>;
+defm DS_DEC_SRC2_U32_vi   : DS_Real_m0_vi<0x84, DS_DEC_SRC2_U32>;
+defm DS_MIN_SRC2_I32_vi   : DS_Real_m0_vi<0x85, DS_MIN_SRC2_I32>;
+defm DS_MAX_SRC2_I32_vi   : DS_Real_m0_vi<0x86, DS_MAX_SRC2_I32>;
+defm DS_MIN_SRC2_U32_vi   : DS_Real_m0_vi<0x87, DS_MIN_SRC2_U32>;
+defm DS_MAX_SRC2_U32_vi   : DS_Real_m0_vi<0x88, DS_MAX_SRC2_U32>;
+defm DS_AND_SRC2_B32_vi   : DS_Real_m0_vi<0x89, DS_AND_SRC2_B32>;
+defm DS_OR_SRC2_B32_vi    : DS_Real_m0_vi<0x8a, DS_OR_SRC2_B32>;
+defm DS_XOR_SRC2_B32_vi   : DS_Real_m0_vi<0x8b, DS_XOR_SRC2_B32>;
+defm DS_WRITE_SRC2_B32_vi : DS_Real_m0_vi<0x8d, DS_WRITE_SRC2_B32>;
+defm DS_MIN_SRC2_F32_vi   : DS_Real_m0_vi<0x92, DS_MIN_SRC2_F32>;
+defm DS_MAX_SRC2_F32_vi   : DS_Real_m0_vi<0x93, DS_MAX_SRC2_F32>;
+defm DS_ADD_SRC2_F32_vi   : DS_Real_m0_vi<0x95, DS_ADD_SRC2_F32>;
+defm DS_ADD_SRC2_U64_vi   : DS_Real_m0_vi<0xc0, DS_ADD_SRC2_U64>;
+defm DS_SUB_SRC2_U64_vi   : DS_Real_m0_vi<0xc1, DS_SUB_SRC2_U64>;
+defm DS_RSUB_SRC2_U64_vi  : DS_Real_m0_vi<0xc2, DS_RSUB_SRC2_U64>;
+defm DS_INC_SRC2_U64_vi   : DS_Real_m0_vi<0xc3, DS_INC_SRC2_U64>;
+defm DS_DEC_SRC2_U64_vi   : DS_Real_m0_vi<0xc4, DS_DEC_SRC2_U64>;
+defm DS_MIN_SRC2_I64_vi   : DS_Real_m0_vi<0xc5, DS_MIN_SRC2_I64>;
+defm DS_MAX_SRC2_I64_vi   : DS_Real_m0_vi<0xc6, DS_MAX_SRC2_I64>;
+defm DS_MIN_SRC2_U64_vi   : DS_Real_m0_vi<0xc7, DS_MIN_SRC2_U64>;
+defm DS_MAX_SRC2_U64_vi   : DS_Real_m0_vi<0xc8, DS_MAX_SRC2_U64>;
+defm DS_AND_SRC2_B64_vi   : DS_Real_m0_vi<0xc9, DS_AND_SRC2_B64>;
+defm DS_OR_SRC2_B64_vi    : DS_Real_m0_vi<0xca, DS_OR_SRC2_B64>;
+defm DS_XOR_SRC2_B64_vi   : DS_Real_m0_vi<0xcb, DS_XOR_SRC2_B64>;
+defm DS_WRITE_SRC2_B64_vi : DS_Real_m0_vi<0xcd, DS_WRITE_SRC2_B64>;
+defm DS_MIN_SRC2_F64_vi   : DS_Real_m0_vi<0xd2, DS_MIN_SRC2_F64>;
+defm DS_MAX_SRC2_F64_vi   : DS_Real_m0_vi<0xd3, DS_MAX_SRC2_F64>;
+defm DS_WRITE_B96_vi      : DS_Real_vi<0xde, DS_WRITE_B96>;
+defm DS_WRITE_B128_vi     : DS_Real_vi<0xdf, DS_WRITE_B128>;
+defm DS_READ_B96_vi       : DS_Real_vi<0xfe, DS_READ_B96>;
+defm DS_READ_B128_vi      : DS_Real_vi<0xff, DS_READ_B128>;
 
 // GFX90A+.
-def DS_ADD_F64_vi     : DS_Real_vi<0x5c, DS_ADD_F64>;
-def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+defm DS_ADD_F64_vi    : DS_Real_m0_vi<0x5c, DS_ADD_F64>;
+defm DS_ADD_RTN_F64_vi: DS_Real_m0_vi<0x7c, DS_ADD_RTN_F64>;
 
 // GFX942+.
-def DS_PK_ADD_F16_vi     : DS_Real_vi<0x17, DS_PK_ADD_F16>;
-def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
-def DS_PK_ADD_BF16_vi     : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
-def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
+defm DS_PK_ADD_F16_vi     : DS_Real_m0_vi<0x17, DS_PK_ADD_F16>;
+defm DS_PK_ADD_RTN_F16_vi : DS_Real_m0_vi<0xb7, DS_PK_ADD_RTN_F16>;
+defm DS_PK_ADD_BF16_vi    : DS_Real_m0_vi<0x18, DS_PK_ADD_BF16>;
+defm DS_PK_ADD_RTN_BF16_vi: DS_Real_m0_vi<0xb8, DS_PK_ADD_RTN_BF16>;
 
 //===----------------------------------------------------------------------===//
 // GFX950.
 //===----------------------------------------------------------------------===//
-def DS_READ_B64_TR_B4_vi  : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
-def DS_READ_B96_TR_B6_vi  : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
-def DS_READ_B64_TR_B8_vi  : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
-def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;
+defm DS_READ_B64_TR_B4_vi : DS_Real_m0_vi<0x0e0, DS_READ_B64_TR_B4>;
+defm DS_READ_B96_TR_B6_vi : DS_Real_m0_vi<0x0e1, DS_READ_B96_TR_B6>;
+defm DS_READ_B64_TR_B8_vi : DS_Real_m0_vi<0x0e2, DS_READ_B64_TR_B8>;
+defm DS_READ_B64_TR_B16_vi: DS_Real_m0_vi<0x0e3, DS_READ_B64_TR_B16>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 6a2beeed41df..6f6039bf4ec2 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/Compiler.h"
 
 using namespace llvm;
+using namespace llvm::MCD;
 
 #define DEBUG_TYPE "amdgpu-disassembler"
 
@@ -446,6 +447,14 @@ static DecodeStatus decodeVersionImm(MCInst &Inst, unsigned Imm,
 
 #include "AMDGPUGenDisassemblerTables.inc"
 
+namespace {
+// Define bitwidths for various types used to instantiate the decoder.
+template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+template <> constexpr uint32_t InsnBitWidth<uint64_t> = 64;
+template <> constexpr uint32_t InsnBitWidth<std::bitset<96>> = 96;
+template <> constexpr uint32_t InsnBitWidth<std::bitset<128>> = 128;
+} // namespace
+
 //===----------------------------------------------------------------------===//
 //
 //===----------------------------------------------------------------------===//
@@ -498,26 +507,24 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
   return Res;
 }
 
-static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<96> eat12Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 12);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<96> Hi(read<uint32_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(4);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
-static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+static inline std::bitset<128> eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  using namespace llvm::support::endian;
   assert(Bytes.size() >= 16);
-  uint64_t Lo =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Lo(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  uint64_t Hi =
-      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  std::bitset<128> Hi(read<uint64_t, endianness::little>(Bytes.data()));
   Bytes = Bytes.slice(8);
-  return DecoderUInt128(Lo, Hi);
+  return (Hi << 64) | Lo;
 }
 
 void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
@@ -600,14 +607,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
     // encodings
     if (isGFX1250() && Bytes.size() >= 16) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
         break;
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
     if (isGFX11Plus() && Bytes.size() >= 12) {
-      DecoderUInt128 DecW = eat12Bytes(Bytes);
+      std::bitset<96> DecW = eat12Bytes(Bytes);
 
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
@@ -642,7 +649,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     } else if (Bytes.size() >= 16 &&
                STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
-      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      std::bitset<128> DecW = eat16Bytes(Bytes);
       if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
         break;
 
@@ -836,6 +843,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
+  // Validate buffer instruction offsets for GFX12+: they must not be negative.
+  if (isGFX12Plus() && isBufferInstruction(MI)) {
+    int OffsetIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::offset);
+    if (OffsetIdx != -1) {
+      uint32_t Imm = MI.getOperand(OffsetIdx).getImm();
+      int64_t SignedOffset = SignExtend64<24>(Imm);
+      if (SignedOffset < 0)
+        return MCDisassembler::Fail;
+    }
+  }
+
   if (MCII->get(MI.getOpcode()).TSFlags &
       (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) {
     int SWZOpIdx =
@@ -1216,6 +1235,26 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   }
 }
 
+// Given a wide tuple \p Reg, check whether it would overflow 256 registers.
+// \returns \p Reg if it fits (or has no sub0), or NoRegister otherwise.
+static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC,
+                                  const MCRegisterInfo &MRI) {
+  unsigned NumRegs = RC.getSizeInBits() / 32;
+  MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0);
+  if (!Sub0)
+    return Reg;
+
+  MCRegister BaseReg;
+  if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(Sub0))
+    BaseReg = AMDGPU::VGPR0;
+  else if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Sub0))
+    BaseReg = AMDGPU::AGPR0;
+
+  assert(BaseReg && "Only vector registers expected");
+
+  return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister;
+}
+
 // Note that before gfx10, the MIMG encoding provided no information about
 // VADDR size. Consequently, decoded instructions always show address as if it
 // has 1 dword, which could be not really so.
@@ -1320,8 +1359,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     MCRegister VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
     Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
 
-    NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
-                                       &MRI.getRegClass(DataRCID));
+    const MCRegisterClass &NewRC = MRI.getRegClass(DataRCID);
+    NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &NewRC);
+    NewVdata = CheckVGPROverflow(NewVdata, NewRC, MRI);
     if (!NewVdata) {
       // It's possible to encode this such that the low register + enabled
       // components exceeds the register count.
@@ -1340,8 +1380,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
     VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
 
     auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
-    NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
-                                        &MRI.getRegClass(AddrRCID));
+    const MCRegisterClass &NewRC = MRI.getRegClass(AddrRCID);
+    NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &NewRC);
+    NewVAddrSA = CheckVGPROverflow(NewVAddrSA, NewRC, MRI);
     if (!NewVAddrSA)
       return;
   }
@@ -2598,9 +2639,6 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
-    if (isGFX1250())
-      PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
-                      KERNEL_CODE_PROPERTY_USES_CU_STORES);
 
     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
@@ -2743,6 +2781,20 @@ const MCExpr *AMDGPUDisassembler::createConstantSymbolExpr(StringRef Id,
   return MCSymbolRefExpr::create(Sym, Ctx);
 }
 
+bool AMDGPUDisassembler::isBufferInstruction(const MCInst &MI) const {
+  const uint64_t TSFlags = MCII->get(MI.getOpcode()).TSFlags;
+
+  // MUBUF and MTBUF instructions are identified directly by their TSFlags.
+  if (TSFlags & (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))
+    return true;
+
+  // SMEM instructions count only when they are buffer forms (S_BUFFER_*).
+  if ((TSFlags & SIInstrFlags::SMRD) && AMDGPU::getSMEMIsBuffer(MI.getOpcode()))
+    return true;
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUSymbolizer
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index f4d164bf10c3..c1131c2936fc 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -32,44 +32,6 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
-// Exposes an interface expected by autogenerated code in
-// FixedLenDecoderEmitter
-class DecoderUInt128 {
-private:
-  uint64_t Lo = 0;
-  uint64_t Hi = 0;
-
-public:
-  DecoderUInt128() = default;
-  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
-  operator bool() const { return Lo || Hi; }
-  uint64_t extractBitsAsZExtValue(unsigned NumBits,
-                                  unsigned BitPosition) const {
-    assert(NumBits && NumBits <= 64);
-    assert(BitPosition < 128);
-    uint64_t Val;
-    if (BitPosition < 64)
-      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
-    else
-      Val = Hi >> (BitPosition - 64);
-    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
-  }
-  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
-    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
-  }
-  DecoderUInt128 operator&(const uint64_t &RHS) const {
-    return *this & DecoderUInt128(RHS);
-  }
-  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
-  bool operator==(const DecoderUInt128 &RHS) {
-    return Lo == RHS.Lo && Hi == RHS.Hi;
-  }
-  bool operator!=(const DecoderUInt128 &RHS) {
-    return Lo != RHS.Lo || Hi != RHS.Hi;
-  }
-  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
-};
-
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
@@ -223,6 +185,9 @@ public:
   bool hasKernargPreload() const;
 
   bool isMacDPP(MCInst &MI) const;
+
+  /// Return true if \p MI is a buffer instruction (MUBUF, MTBUF, or S_BUFFER).
+  bool isBufferInstruction(const MCInst &MI) const;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 280def5440c8..dadc7dcd7054 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -626,7 +626,6 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
 
   let usesCustomInserter = 1;
   let LDS_1A = 1;
-  let DisableEncoding = "$dst";
 }
 
 class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
@@ -658,7 +657,6 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
 
   let BaseOp = name;
   let usesCustomInserter = 1;
-  let DisableEncoding = "$dst";
 }
 
 class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
@@ -688,7 +686,6 @@ class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
 
   let BaseOp = name;
   let usesCustomInserter = 1;
-  let DisableEncoding = "$dst";
 }
 
 def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f5d438436b29..a1306565bbe2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -12,9 +12,11 @@ let WantsRoot = true in {
   def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
 
   def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
+  def GlobalSAddrNoIOffsetM0 : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffsetM0", [], [], -3>;
   def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
   def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
   def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
+  def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>;
   def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
   def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
 }
@@ -135,7 +137,18 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   // unsigned for flat accesses.
   bits<13> offset;
   // GFX90A+ only: instruction uses AccVGPR for data
-  bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0));
+  defvar DstOpIsAV = !if(ps.has_vdst,
+                         VDstOperandIsAV<ps.OutOperandList>.ret, 0);
+  defvar DstOpIsAGPR = !if(ps.has_vdst,
+                           VDstOperandIsAGPR<ps.OutOperandList>.ret, 0);
+  defvar DataOpIsAV = !if(ps.has_data,
+                          VDataOperandIsAV<ps.InOperandList>.ret, 0);
+  defvar DataOpIsAGPR = !if(ps.has_data,
+                            VDataOperandIsAGPR<ps.InOperandList>.ret, 0);
+
+  bits<1> acc = !if(ps.has_vdst,
+                    !if(DstOpIsAV, vdst{9}, DstOpIsAGPR),
+                    !if(DataOpIsAV, vdata{9}, DataOpIsAGPR));
 
   // We don't use tfe right now, and it was removed in gfx9.
   bits<1> tfe = 0;
@@ -214,11 +227,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
 class FLAT_Load_Pseudo<
-    string opName, RegisterClass regClass, bit HasTiedOutput = 0,
+    string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0,
     bit HasSaddr = 0, bit EnableSaddr = 0>
     : FLAT_Pseudo<opName, (outs), (ins), ""> {
 
-  defvar vdata_op = getLdStRegisterOperand<regClass>.ret;
   let OutOperandList = (outs vdata_op:$vdst);
   let InOperandList = !con(
     !if(EnableSaddr,
@@ -239,10 +251,9 @@ class FLAT_Load_Pseudo<
   let enabled_saddr = EnableSaddr;
 
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
-  let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
-multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
+multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32, bit HasTiedInput = 0> {
   def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>,
     GlobalSaddrTable<0, opName>;
   let OtherPredicates = [HasFlatGVSMode] in
@@ -251,19 +262,19 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasT
 }
 
 multiclass FLAT_Flat_Load_Pseudo_t16<string opName> {
-  defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>;
+  defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>;
   let True16Predicate = UseRealTrue16Insts in
-    defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+    defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>;
 }
 
-class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
+class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass,
   bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs),
   !con(
     !if(EnableSaddr,
-      (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64_XEXEC_XNULL:$saddr),
-      (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)),
+      (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64_XEXEC_XNULL:$saddr),
+      (ins VReg_64:$vaddr, vdataClass:$vdata)),
       (ins flat_offset:$offset, CPol_0:$cpol)),
   " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
   let mayLoad  = 0;
@@ -273,7 +284,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
   let enabled_saddr = EnableSaddr;
 }
 
-multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
   def "" : FLAT_Store_Pseudo<opName, regClass>,
     GlobalSaddrTable<0, opName>;
   let OtherPredicates = [HasFlatGVSMode] in
@@ -282,21 +293,22 @@ multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> {
 }
 
 multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
-  defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>;
+  defm "" : FLAT_Flat_Store_Pseudo<opName, AVLdSt_32>;
 
   defvar Name16 = opName#"_t16";
   let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in {
-    def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+    def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>,
       GlobalSaddrTable<0, Name16>,
       True16D16Table<NAME#"_D16_HI", NAME>;
-	def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+    def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
       GlobalSaddrTable<1, Name16>,
       True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
   }
 }
 
-multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
-  let is_flat_global = 1 in {
+multiclass FLAT_Global_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32,
+                                   bit HasTiedInput = 0> {
+  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
     def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -305,21 +317,21 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
 }
 
 multiclass FLAT_Global_Load_Pseudo_t16<string opName> {
-  defm "" : FLAT_Global_Load_Pseudo<opName, VGPR_32, 1>;
+  defm "" : FLAT_Global_Load_Pseudo<opName, AVLdSt_32, 1>;
 
   defvar Name16 = opName#"_t16";
   let OtherPredicates = [HasTrue16BitInsts],
       SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in {
-    def _t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1>,
+    def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1>,
       GlobalSaddrTable<0, Name16>,
       True16D16Table<NAME#"_HI", NAME>;
-    def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>,
+    def _SADDR_t16 : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
       GlobalSaddrTable<1, Name16>,
       True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
   }
 }
 
-class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
+class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterOperand regClass,
   bit HasTiedOutput = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs regClass:$vdst),
@@ -335,10 +347,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
   let enabled_saddr = EnableSaddr;
 
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
-  let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
-multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
+multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterOperand regClass,
   bit HasTiedOutput = 0> {
   def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput>,
     GlobalSaddrTable<0, opName>;
@@ -346,8 +357,8 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
     GlobalSaddrTable<1, opName>;
 }
 
-multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
-  let is_flat_global = 1 in {
+multiclass FLAT_Global_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
+  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
     def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -356,15 +367,15 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
 }
 
 multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
-  defm "" : FLAT_Global_Store_Pseudo<opName, VGPR_32>;
+  defm "" : FLAT_Global_Store_Pseudo<opName, AVLdSt_32>;
 
   defvar Name16 = opName#"_t16";
   let OtherPredicates = [HasTrue16BitInsts],
       SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in {
-    def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>,
+    def _t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1>,
       GlobalSaddrTable<0, Name16>,
       True16D16Table<NAME#"_D16_HI", NAME>;
-    def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+    def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
       GlobalSaddrTable<1, Name16>,
       True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
   }
@@ -435,7 +446,7 @@ multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
     GlobalSaddrTable<1, opName>;
 }
 
-class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
+class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterOperand vdataClass,
   bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs),
@@ -451,7 +462,7 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
   let enabled_saddr = EnableSaddr;
 }
 
-multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterOperand regClass> {
   def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass>,
     GlobalSaddrTable<0, opName>;
   def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, 1>,
@@ -539,14 +550,14 @@ class FlatScratchInst <string sv_op, string mode> {
   string Mode = mode;
 }
 
-class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
+class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt_32,
   bit HasTiedOutput = 0,
   bit EnableSaddr = 0,
   bit EnableSVE = 0,
   bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))>
   : FLAT_Pseudo<
   opName,
-  (outs getLdStRegisterOperand<regClass>.ret:$vdst),
+  (outs regClass:$vdst),
   !con(
     !if(EnableSVE,
         (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
@@ -555,7 +566,7 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
           !if(EnableVaddr,
             (ins VGPR_32:$vaddr, flat_offset:$offset),
             (ins flat_offset:$offset)))),
-     !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
+     !if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in),
                         (ins CPol_0:$cpol))),
   " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
   let is_flat_scratch = 1;
@@ -568,13 +579,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
   let sve = EnableVaddr;
 
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
-  let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
-class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
+class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit EnableSaddr = 0,
   bit EnableSVE = 0,
-  bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr)),
-  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
+  bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo<
   opName,
   (outs),
   !if(EnableSVE,
@@ -596,7 +605,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
   let sve = EnableVaddr;
 }
 
-multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
+multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32,
+                                    bit HasTiedOutput = 0> {
   def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
            FlatScratchInst<opName, "SV">;
   def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
@@ -612,29 +622,29 @@ multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit H
 }
 
 multiclass FLAT_Scratch_Load_Pseudo_t16<string opName> {
-  defm "" : FLAT_Scratch_Load_Pseudo<opName, VGPR_32, 1>;
+  defm "" : FLAT_Scratch_Load_Pseudo<opName, AVLdSt_32, 1>;
 
   defvar Name16 = opName#"_t16";
   let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in {
-    def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0>,
+    def _t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0>,
                FlatScratchInst<Name16, "SV">,
                True16D16Table<NAME#"_HI", NAME>;
-    def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1>,
+    def _SADDR_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1>,
                      FlatScratchInst<Name16, "SS">,
                      True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">;
     let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 1, 1>,
+    def _SVS_t16 : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>,
                    FlatScratchInst<Name16, "SVS">,
                    True16D16Table<NAME#"_HI_SVS", NAME#"_SVS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST_t16  : FLAT_Scratch_Load_Pseudo<Name16, VGPR_16, 0, 0, 0, 0>,
+    def _ST_t16  : FLAT_Scratch_Load_Pseudo<Name16, VGPROp_16, 0, 0, 0, 0>,
                    FlatScratchInst<Name16, "ST">,
                    True16D16Table<NAME#"_HI_ST", NAME#"_ST">;
   }
 }
 
-multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
+multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
   def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
            FlatScratchInst<opName, "SV">;
   def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
@@ -650,24 +660,24 @@ multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
 }
 
 multiclass FLAT_Scratch_Store_Pseudo_t16<string opName> {
-  defm "" : FLAT_Scratch_Store_Pseudo<opName, VGPR_32>;
+  defm "" : FLAT_Scratch_Store_Pseudo<opName, AVLdSt_32>;
 
   defvar Name16 = opName#"_t16";
   let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in {
-    def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16>,
+    def _t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16>,
                FlatScratchInst<Name16, "SV">,
                True16D16Table<NAME#"_D16_HI", NAME>;
-    def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1>,
+    def _SADDR_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1>,
                    FlatScratchInst<Name16, "SS">,
                    True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">;
 
     let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 1, 1>,
+    def _SVS_t16 : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 1, 1>,
                    FlatScratchInst<Name16, "SVS">,
                    True16D16Table<NAME#"_D16_HI_SVS", NAME#"_SVS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST_t16  : FLAT_Scratch_Store_Pseudo<Name16, VGPR_16, 0, 0, 0>,
+    def _ST_t16  : FLAT_Scratch_Store_Pseudo<Name16, VGPROp_16, 0, 0, 0>,
                    FlatScratchInst<Name16, "ST">,
                    True16D16Table<NAME#"_D16_HI_ST", NAME#"_ST">;
   }
@@ -741,11 +751,10 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
 
 multiclass FLAT_Atomic_Pseudo_NO_RTN<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_op,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc,
-  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+  RegisterOperand data_op = vdst_op> {
   def "" : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
     (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
@@ -770,15 +779,17 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
 
 multiclass FLAT_Atomic_Pseudo_RTN<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_op,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc,
-  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
-  RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
+  RegisterOperand data_op = vdst_op> {
+
+  defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret;
+  defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret;
+
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+    (outs vdst_op_vgpr:$vdst),
+    (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
     " $vdst, $vaddr, $vdata$offset$cpol">,
     GlobalSaddrTable<0, opName#"_rtn"> {
     let FPAtomic = data_vt.isFP;
@@ -786,8 +797,8 @@ multiclass FLAT_Atomic_Pseudo_RTN<
   }
 
   def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+    (outs vdst_op_vgpr:$vdst),
+      (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
     " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
     GlobalSaddrTable<1, opName#"_rtn"> {
     let OtherPredicates = [HasFlatGVSMode];
@@ -797,26 +808,37 @@ multiclass FLAT_Atomic_Pseudo_RTN<
     let FPAtomic = data_vt.isFP;
     let AddedComplexity = -1; // Prefer global atomics if available
   }
+
+  defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret;
+  defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret;
+
+  def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+    (outs vdst_op_agpr:$vdst),
+    (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+    " $vdst, $vaddr, $vdata$offset$cpol">,
+    GlobalSaddrTable<0, opName#"_rtn_agpr"> {
+    let FPAtomic = data_vt.isFP;
+    let AddedComplexity = -1; // Prefer global atomics if available
+  }
+  // No saddr agpr form. HasFlatGVSMode targets do not have AGPRs.
 }
 
 multiclass FLAT_Atomic_Pseudo<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_op,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc,
-  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
-  defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
-  defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
+  RegisterOperand data_op = vdst_op> {
+  defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_op, vt, data_vt, data_op>;
+  defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_op, vt, data_vt, data_op>;
 }
 
 multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_op,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc,
-  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+  RegisterOperand data_op = vdst_op> {
 
   let is_flat_global = 1 in {
     def "" : FLAT_AtomicNoRet_Pseudo <opName,
@@ -842,17 +864,18 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
 
 multiclass FLAT_Global_Atomic_Pseudo_RTN<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_op,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc,
-  RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
-  RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
+  RegisterOperand data_op = vdst_op> {
+
+  defvar vdst_op_vgpr = getEquivalentVGPROperand<vdst_op>.ret;
+  defvar data_op_vgpr = getEquivalentVGPROperand<data_op>.ret;
 
   let is_flat_global = 1 in {
     def _RTN : FLAT_AtomicRet_Pseudo <opName,
-      (outs vdst_op:$vdst),
-        (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+      (outs vdst_op_vgpr:$vdst),
+        (ins VReg_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
       " $vdst, $vaddr, $vdata, off$offset$cpol">,
       GlobalSaddrTable<0, opName#"_rtn"> {
       let has_saddr = 1;
@@ -860,23 +883,47 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
     }
 
     def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
-      (outs vdst_op:$vdst),
-        (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+      (outs vdst_op_vgpr:$vdst),
+        (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
       " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
       GlobalSaddrTable<1, opName#"_rtn"> {
        let has_saddr = 1;
        let enabled_saddr = 1;
        let FPAtomic = data_vt.isFP;
     }
+
+    defvar vdst_op_agpr = getEquivalentAGPROperand<vdst_op>.ret;
+    defvar data_op_agpr = getEquivalentAGPROperand<data_op>.ret;
+
+    let SubtargetPredicate = isGFX90APlus in {
+      def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+        (outs vdst_op_agpr:$vdst),
+        (ins VReg_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+        " $vdst, $vaddr, $vdata, off$offset$cpol">,
+        GlobalSaddrTable<0, opName#"_rtn_agpr"> {
+        let has_saddr = 1;
+        let FPAtomic = data_vt.isFP;
+      }
+
+      def _SADDR_RTN_agpr : FLAT_AtomicRet_Pseudo <opName,
+        (outs vdst_op_agpr:$vdst),
+        (ins VGPR_32:$vaddr, data_op_agpr:$vdata, SReg_64_XEXEC_XNULL:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+        " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+        GlobalSaddrTable<1, opName#"_rtn_agpr"> {
+         let has_saddr = 1;
+         let enabled_saddr = 1;
+         let FPAtomic = data_vt.isFP;
+      }
+    }
   }
 }
 
 multiclass FLAT_Global_Atomic_Pseudo<
   string opName,
-  RegisterClass vdst_rc,
+  RegisterOperand vdst_rc,
   ValueType vt,
   ValueType data_vt = vt,
-  RegisterClass data_rc = vdst_rc> {
+  RegisterOperand data_rc = vdst_rc> {
   defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
   defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
 }
@@ -885,119 +932,119 @@ multiclass FLAT_Global_Atomic_Pseudo<
 // Flat Instructions
 //===----------------------------------------------------------------------===//
 
-defm FLAT_LOAD_UBYTE    : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
-defm FLAT_LOAD_SBYTE    : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
-defm FLAT_LOAD_USHORT   : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>;
-defm FLAT_LOAD_SSHORT   : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>;
-defm FLAT_LOAD_DWORD    : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>;
-defm FLAT_LOAD_DWORDX2  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
-defm FLAT_LOAD_DWORDX4  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
-defm FLAT_LOAD_DWORDX3  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+defm FLAT_LOAD_UBYTE    : FLAT_Flat_Load_Pseudo <"flat_load_ubyte">;
+defm FLAT_LOAD_SBYTE    : FLAT_Flat_Load_Pseudo <"flat_load_sbyte">;
+defm FLAT_LOAD_USHORT   : FLAT_Flat_Load_Pseudo <"flat_load_ushort">;
+defm FLAT_LOAD_SSHORT   : FLAT_Flat_Load_Pseudo <"flat_load_sshort">;
+defm FLAT_LOAD_DWORD    : FLAT_Flat_Load_Pseudo <"flat_load_dword">;
+defm FLAT_LOAD_DWORDX2  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", AVLdSt_64>;
+defm FLAT_LOAD_DWORDX4  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", AVLdSt_128>;
+defm FLAT_LOAD_DWORDX3  : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", AVLdSt_96>;
 
-defm FLAT_STORE_DWORD   : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>;
-defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
-defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
-defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+defm FLAT_STORE_DWORD   : FLAT_Flat_Store_Pseudo <"flat_store_dword">;
+defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", AVLdSt_64>;
+defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", AVLdSt_128>;
+defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", AVLdSt_96>;
 
 let SubtargetPredicate = HasD16LoadStore in {
 let TiedSourceNotRead = 1 in {
-defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", AVLdSt_32, 1>;
 defm FLAT_LOAD_UBYTE_D16    : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
-defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", AVLdSt_32, 1>;
 defm FLAT_LOAD_SBYTE_D16    : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
-defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", AVLdSt_32, 1>;
 defm FLAT_LOAD_SHORT_D16    : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">;
 }
 
-defm FLAT_STORE_BYTE_D16_HI  : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
-defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
+defm FLAT_STORE_BYTE_D16_HI  : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi">;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi">;
 }
 
 defm FLAT_STORE_BYTE   : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">;
 defm FLAT_STORE_SHORT  : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">;
 
 defm FLAT_ATOMIC_CMPSWAP    : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
-                                VGPR_32, i32, v2i32, VReg_64>;
+                                AVLdSt_32, i32, v2i32, AVLdSt_64>;
 
 defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
-                                VReg_64, i64, v2i64, VReg_128>;
+                                AVLdSt_64, i64, v2i64, AVLdSt_128>;
 
 defm FLAT_ATOMIC_SWAP       : FLAT_Atomic_Pseudo <"flat_atomic_swap",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_SWAP_X2    : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_ADD        : FLAT_Atomic_Pseudo <"flat_atomic_add",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_SUB        : FLAT_Atomic_Pseudo <"flat_atomic_sub",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_SMIN       : FLAT_Atomic_Pseudo <"flat_atomic_smin",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_UMIN       : FLAT_Atomic_Pseudo <"flat_atomic_umin",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_SMAX       : FLAT_Atomic_Pseudo <"flat_atomic_smax",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_UMAX       : FLAT_Atomic_Pseudo <"flat_atomic_umax",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_AND        : FLAT_Atomic_Pseudo <"flat_atomic_and",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_OR         : FLAT_Atomic_Pseudo <"flat_atomic_or",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_XOR        : FLAT_Atomic_Pseudo <"flat_atomic_xor",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_INC        : FLAT_Atomic_Pseudo <"flat_atomic_inc",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_DEC        : FLAT_Atomic_Pseudo <"flat_atomic_dec",
-                                VGPR_32, i32>;
+                                AVLdSt_32, i32>;
 
 defm FLAT_ATOMIC_ADD_X2     : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_SUB_X2     : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_SMIN_X2    : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_UMIN_X2    : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_SMAX_X2    : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_UMAX_X2    : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_AND_X2     : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_OR_X2      : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_XOR_X2     : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_INC_X2     : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm FLAT_ATOMIC_DEC_X2     : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 // GFX7-, GFX10-only flat instructions.
 let SubtargetPredicate = isGFX7GFX10 in {
 defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
-                                VReg_64, f64, v2f64, VReg_128>;
+                                AVLdSt_64, f64, v2f64, AVLdSt_128>;
 } // End SubtargetPredicate = isGFX7GFX10
 
 
@@ -1005,169 +1052,173 @@ defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
 // choose this as the canonical name.
 let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
 defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo <"flat_atomic_min_f64",
-                                               VReg_64, f64>;
+                                               AVLdSt_64, f64>;
 
 defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo <"flat_atomic_max_f64",
-                                                VReg_64, f64>;
+                                                AVLdSt_64, f64>;
 }
 
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts in {
-defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
-defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
+defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", AVLdSt_64, f64>;
+defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", AVLdSt_64, f64>;
 }
 
 let SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst  in {
-  defm FLAT_ATOMIC_ADD_F64   : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
-  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+  defm FLAT_ATOMIC_ADD_F64   : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", AVLdSt_64, f64>;
+  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", AVLdSt_64, f64>;
 } // End SubtargetPredicate = HasFlatBufferGlobalAtomicFaddF64Inst
 
 let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16",  VGPR_32, v2f16>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", AVLdSt_32, v2f16>;
   let FPAtomic = 1 in
-    defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2i16>;
+    defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", AVLdSt_32, v2i16>;
 } // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts
 
 let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst, FPAtomic = 1 in
-  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2i16>;
+  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", AVLdSt_32, v2i16>;
 
 // GFX7-, GFX10-, GFX11-only flat instructions.
 let SubtargetPredicate = isGFX7GFX10GFX11 in {
 
 defm FLAT_ATOMIC_FCMPSWAP    : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
-                                VGPR_32, f32, v2f32, VReg_64>;
+                                AVLdSt_32, f32, v2f32, AVLdSt_64>;
 
 defm FLAT_ATOMIC_FMIN        : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
-                                VGPR_32, f32>;
+                                AVLdSt_32, f32>;
 
 defm FLAT_ATOMIC_FMAX        : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
-                                VGPR_32, f32>;
+                                AVLdSt_32, f32>;
 
 } // End SubtargetPredicate = isGFX7GFX10GFX11
 
 // GFX942-, GFX11-only flat instructions.
 let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Atomic_Pseudo<"flat_atomic_add_f32",     VGPR_32, f32>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Atomic_Pseudo<"flat_atomic_add_f32",     AVLdSt_32, f32>;
 } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst
 
 let SubtargetPredicate = isGFX12Plus in {
-  defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>;
-  defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>;
-} // End SubtargetPredicate = isGFX12Plus
+  defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>;
+  defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+} // End SubtargetPredicate = isGFX12Plus
+
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { // The NO_RTN form of cond_sub is gated separately from the RTN form.
+  defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+} // End SubtargetPredicate = HasAtomicCSubNoRtnInsts
 
-defm GLOBAL_LOAD_UBYTE    : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
-defm GLOBAL_LOAD_SBYTE    : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
-defm GLOBAL_LOAD_USHORT   : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
-defm GLOBAL_LOAD_SSHORT   : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
-defm GLOBAL_LOAD_DWORD    : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
-defm GLOBAL_LOAD_DWORDX2  : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
-defm GLOBAL_LOAD_DWORDX3  : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
-defm GLOBAL_LOAD_DWORDX4  : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+defm GLOBAL_LOAD_UBYTE    : FLAT_Global_Load_Pseudo <"global_load_ubyte">;
+defm GLOBAL_LOAD_SBYTE    : FLAT_Global_Load_Pseudo <"global_load_sbyte">;
+defm GLOBAL_LOAD_USHORT   : FLAT_Global_Load_Pseudo <"global_load_ushort">;
+defm GLOBAL_LOAD_SSHORT   : FLAT_Global_Load_Pseudo <"global_load_sshort">;
+defm GLOBAL_LOAD_DWORD    : FLAT_Global_Load_Pseudo <"global_load_dword">;
+defm GLOBAL_LOAD_DWORDX2  : FLAT_Global_Load_Pseudo <"global_load_dwordx2", AVLdSt_64>;
+defm GLOBAL_LOAD_DWORDX3  : FLAT_Global_Load_Pseudo <"global_load_dwordx3", AVLdSt_96>;
+defm GLOBAL_LOAD_DWORDX4  : FLAT_Global_Load_Pseudo <"global_load_dwordx4", AVLdSt_128>;
 
 let TiedSourceNotRead = 1 in {
-defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
-defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
-defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", AVLdSt_32, 1>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", AVLdSt_32, 1>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", AVLdSt_32, 1>;
 defm GLOBAL_LOAD_SBYTE_D16    : FLAT_Global_Load_Pseudo_t16 <"global_load_sbyte_d16">;
 defm GLOBAL_LOAD_SHORT_D16    : FLAT_Global_Load_Pseudo_t16 <"global_load_short_d16">;
 defm GLOBAL_LOAD_UBYTE_D16    : FLAT_Global_Load_Pseudo_t16 <"global_load_ubyte_d16">;
 }
 
-defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
-defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
+defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi">;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi">;
 
 let OtherPredicates = [HasGFX10_BEncoding] in
-defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPROp_32>;
 
 defm GLOBAL_STORE_BYTE    : FLAT_Global_Store_Pseudo_t16 <"global_store_byte">;
 defm GLOBAL_STORE_SHORT   : FLAT_Global_Store_Pseudo_t16 <"global_store_short">;
-defm GLOBAL_STORE_DWORD   : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
-defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
-defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
-defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+defm GLOBAL_STORE_DWORD   : FLAT_Global_Store_Pseudo <"global_store_dword">;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", AVLdSt_64>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", AVLdSt_96>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", AVLdSt_128>;
 let OtherPredicates = [HasGFX10_BEncoding] in
-defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPROp_32>;
 
 defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
-                               VGPR_32, i32, v2i32, VReg_64>;
+                               AVLdSt_32, i32, v2i32, AVLdSt_64>;
 
 defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
-                                  VReg_64, i64, v2i64, VReg_128>;
+                                  AVLdSt_64, i64, v2i64, AVLdSt_128>;
 
 defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
-                             VGPR_32, i32>;
+                             AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
-                                VReg_64, i64>;
+                                AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
-                            VGPR_32, i32>;
+                            AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
-                            VGPR_32, i32>;
+                            AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
-                            VGPR_32, i32>;
+                            AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
-                            VGPR_32, i32>;
+                            AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
-                          VGPR_32, i32>;
+                          AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
-                           VGPR_32, i32>;
+                           AVLdSt_32, i32>;
 
 defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
-                               VReg_64, i64>;
+                               AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
-                               VReg_64, i64>;
+                               AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
-                               VReg_64, i64>;
+                               AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
-                               VReg_64, i64>;
+                               AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
-                             VReg_64, i64>;
+                             AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
-                              VReg_64, i64>;
+                              AVLdSt_64, i64>;
 
 let SubtargetPredicate = HasGFX10_BEncoding in {
   defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo <"global_atomic_csub",
-                                VGPR_32, i32>;
+                                VGPROp_32, i32>;
 }
 
 defm GLOBAL_LOAD_LDS_UBYTE  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
@@ -1182,10 +1233,10 @@ defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwo
 }
 
 let SubtargetPredicate = isGFX12PlusNot12_50 in
-  defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+  defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VGPROp_64, i64>;
 
 let SubtargetPredicate = isGFX12Plus in {
-  defm GLOBAL_ATOMIC_COND_SUB_U32    : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
+  defm GLOBAL_ATOMIC_COND_SUB_U32    : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPROp_32, i32>;
 
   def GLOBAL_INV    : FLAT_Global_Invalidate_Writeback<"global_inv">;
   def GLOBAL_WB     : FLAT_Global_Invalidate_Writeback<"global_wb">;
@@ -1194,6 +1245,12 @@ let SubtargetPredicate = isGFX12Plus in {
 
 let SubtargetPredicate = isGFX1250Plus in {
 
+let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in {
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B8      :  FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8",   1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B32     :  FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b32",  1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B64     :  FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b64",  1>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B128    :  FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b128", 1>;
+} // End Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B8       :  FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8",    1>;
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B32      :  FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32",   1>;
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B64      :  FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64",   1>;
@@ -1207,33 +1264,33 @@ def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
 def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
 } // End SubtargetPredicate = isGFX1250Plus
 
-defm SCRATCH_LOAD_UBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
-defm SCRATCH_LOAD_SBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>;
-defm SCRATCH_LOAD_USHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>;
-defm SCRATCH_LOAD_SSHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort", VGPR_32>;
-defm SCRATCH_LOAD_DWORD    : FLAT_Scratch_Load_Pseudo <"scratch_load_dword", VGPR_32>;
-defm SCRATCH_LOAD_DWORDX2  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", VReg_64>;
-defm SCRATCH_LOAD_DWORDX3  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>;
-defm SCRATCH_LOAD_DWORDX4  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>;
+defm SCRATCH_LOAD_UBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">;
+defm SCRATCH_LOAD_SBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">;
+defm SCRATCH_LOAD_USHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort">;
+defm SCRATCH_LOAD_SSHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort">;
+defm SCRATCH_LOAD_DWORD    : FLAT_Scratch_Load_Pseudo <"scratch_load_dword">;
+defm SCRATCH_LOAD_DWORDX2  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", AVLdSt_64>;
+defm SCRATCH_LOAD_DWORDX3  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", AVLdSt_96>;
+defm SCRATCH_LOAD_DWORDX4  : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", AVLdSt_128>;
 
 let TiedSourceNotRead = 1 in {
-defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>;
-defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>;
-defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", AVLdSt_32, 1>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", AVLdSt_32, 1>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", AVLdSt_32, 1>;
 defm SCRATCH_LOAD_UBYTE_D16    : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_ubyte_d16">;
 defm SCRATCH_LOAD_SBYTE_D16    : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_sbyte_d16">;
 defm SCRATCH_LOAD_SHORT_D16    : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_short_d16">;
 }
 
-defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>;
-defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi">;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi">;
 
 defm SCRATCH_STORE_BYTE    : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_byte">;
 defm SCRATCH_STORE_SHORT   : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_short">;
-defm SCRATCH_STORE_DWORD   : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>;
-defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>;
-defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>;
-defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>;
+defm SCRATCH_STORE_DWORD   : FLAT_Scratch_Store_Pseudo <"scratch_store_dword">;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", AVLdSt_64>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", AVLdSt_96>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", AVLdSt_128>;
 
 defm SCRATCH_LOAD_LDS_UBYTE  : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">;
 defm SCRATCH_LOAD_LDS_SBYTE  : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">;
@@ -1242,69 +1299,77 @@ defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_s
 defm SCRATCH_LOAD_LDS_DWORD  : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
 
 let SubtargetPredicate = isGFX125xOnly in {
-defm FLAT_LOAD_MONITOR_B32    : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32",  VGPR_32>;
-defm FLAT_LOAD_MONITOR_B64    : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64",  VReg_64>;
-defm FLAT_LOAD_MONITOR_B128   : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>;
+defm FLAT_LOAD_MONITOR_B32    : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32",  VGPROp_32>;
+defm FLAT_LOAD_MONITOR_B64    : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64",  VGPROp_64>;
+defm FLAT_LOAD_MONITOR_B128   : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VGPROp_128>;
 
-defm GLOBAL_LOAD_MONITOR_B32  : FLAT_Global_Load_Pseudo <"global_load_monitor_b32",  VGPR_32>;
-defm GLOBAL_LOAD_MONITOR_B64  : FLAT_Global_Load_Pseudo <"global_load_monitor_b64",  VReg_64>;
-defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
+defm GLOBAL_LOAD_MONITOR_B32  : FLAT_Global_Load_Pseudo <"global_load_monitor_b32",  VGPROp_32>;
+defm GLOBAL_LOAD_MONITOR_B64  : FLAT_Global_Load_Pseudo <"global_load_monitor_b64",  VGPROp_64>;
+defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VGPROp_128>;
 } // End SubtargetPredicate = isGFX125xOnly
 
+let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
+let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask.
+defm CLUSTER_LOAD_B32         : FLAT_Global_Load_Pseudo <"cluster_load_b32",  VGPROp_32>;
+defm CLUSTER_LOAD_B64         : FLAT_Global_Load_Pseudo <"cluster_load_b64",  VGPROp_64>;
+defm CLUSTER_LOAD_B128        : FLAT_Global_Load_Pseudo <"cluster_load_b128", VGPROp_128>;
+} // End Uses = [M0, EXEC]
+} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
+
 let SubtargetPredicate = isGFX12Plus in {
   let Uses = [EXEC, M0] in {
-    defm GLOBAL_LOAD_BLOCK  : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
-    defm GLOBAL_STORE_BLOCK  : FLAT_Global_Store_Pseudo <"global_store_block", VReg_1024>;
+    defm GLOBAL_LOAD_BLOCK  : FLAT_Global_Load_Pseudo <"global_load_block", VGPROp_1024>;
+    defm GLOBAL_STORE_BLOCK  : FLAT_Global_Store_Pseudo <"global_store_block", VGPROp_1024>;
   }
   let Uses = [EXEC, FLAT_SCR, M0] in {
-    defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VReg_1024>;
-    defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VReg_1024>;
+    defm SCRATCH_LOAD_BLOCK : FLAT_Scratch_Load_Pseudo <"scratch_load_block", VGPROp_1024>;
+    defm SCRATCH_STORE_BLOCK : FLAT_Scratch_Store_Pseudo <"scratch_store_block", VGPROp_1024>;
   }
 
   let WaveSizePredicate = isWave32 in {
-    defm GLOBAL_LOAD_TR_B128_w32  : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VReg_128>;
-    defm GLOBAL_LOAD_TR_B64_w32   : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VReg_64>;
+    defm GLOBAL_LOAD_TR_B128_w32  : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VGPROp_128>;
+    defm GLOBAL_LOAD_TR_B64_w32   : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VGPROp_64>;
   }
 } // End SubtargetPredicate = isGFX12Plus
 
 let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
   let Mnemonic = "global_load_tr_b128" in
-  defm GLOBAL_LOAD_TR_B128_w64  : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
+  defm GLOBAL_LOAD_TR_B128_w64  : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VGPROp_64>;
   let Mnemonic = "global_load_tr_b64" in
-  defm GLOBAL_LOAD_TR_B64_w64   : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
+  defm GLOBAL_LOAD_TR_B64_w64   : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPROp_32>;
 }
 
 let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in {
-  defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
-  defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
+  defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VGPROp_96>;
+  defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VGPROp_64>;
 }
 
 let SubtargetPredicate = isGFX10Plus in {
   defm GLOBAL_ATOMIC_FCMPSWAP :
-    FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
+    FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", AVLdSt_32, f32, v2f32, AVLdSt_64>;
   defm GLOBAL_ATOMIC_FMIN :
-    FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
+    FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", AVLdSt_32, f32>;
   defm GLOBAL_ATOMIC_FMAX :
-    FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
+    FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", AVLdSt_32, f32>;
   defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
-    FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>;
+    FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>;
 } // End SubtargetPredicate = isGFX10Plus
 
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-    "global_atomic_add_f32", VGPR_32, f32
+    "global_atomic_add_f32", AVLdSt_32, f32
   >;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-    "global_atomic_pk_add_f16", VGPR_32, v2f16
+    "global_atomic_pk_add_f16", AVLdSt_32, v2f16
   >;
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
-    "global_atomic_add_f32", VGPR_32, f32
+    "global_atomic_add_f32", AVLdSt_32, f32
   >;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
-    "global_atomic_pk_add_f16", VGPR_32, v2f16
+    "global_atomic_pk_add_f16", AVLdSt_32, v2f16
   >;
 
 let SubtargetPredicate = HasVmemPrefInsts in {
@@ -1362,6 +1427,16 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
   (inst $saddr, $voffset, $offset, $cpol)
 >;
 
+class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // LDS-load pattern for nodes that carry a trailing M0 operand.
+  (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0),
+  (inst $dsaddr, $vaddr, $offset, $cpol) // M0 appears only on the matched side; it is not an explicit operand of inst here.
+>;
+
+class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // LDS-load pattern ($saddr + $voffset addressing) for nodes that carry a trailing M0 operand.
+  (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0),
+  (inst $dsaddr, $saddr, $voffset, $offset, $cpol) // $cpol is bound inside the GlobalSAddrNoIOffsetM0 operand; the node's last (i32 timm) is matched but unnamed.
+>;
+
 class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat <
   (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
   (inst $dsaddr, $vaddr, $offset, $cpol)
@@ -1397,6 +1472,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
   (inst $saddr, $voffset, $offset, $cpol)
 >;
 
+class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < // Load pattern for nodes that carry a trailing M0 operand.
+  (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
+  (inst $vaddr, $offset, $cpol) // M0 appears only on the matched side; it is not an explicit operand of inst here.
+>;
+
+class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < // Load pattern ($saddr + $voffset addressing) for nodes that carry a trailing M0 operand.
+  (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
+  (inst $saddr, $voffset, $offset, $cpol) // $cpol is bound inside the GlobalSAddrCPolM0 operand; the node's own (i32 timm) is matched but unnamed.
+>;
+
 class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
   (inst $vaddr, $offset, $cpol)
@@ -1416,8 +1501,10 @@ class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
 class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
                           ValueType vt, ValueType data_vt = vt> : GCNPat <
   (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
-  (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)
->;
+  (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> {
+  let SubtargetPredicate = inst.SubtargetPredicate; // Gate the pattern on the same predicates as the pseudo it selects.
+  let OtherPredicates = inst.OtherPredicates; // Likewise copied from the selected pseudo.
+}
 
 class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
                                  ValueType vt> : GCNPat <
@@ -1443,19 +1530,24 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
 
-multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType vt,
                                    ValueType data_vt = vt> {
-
+  defvar inst = !cast<FLAT_Pseudo>(base_inst_name); // Pseudo selected by the $vaddr pattern below.
+  defvar inst_saddr = !cast<FLAT_Pseudo>(inst#"_SADDR"); // Pseudo selected by the FlatAtomicSaddrPat below.
   defvar noRtnNode = !cast<PatFrags>(node);
 
   let AddedComplexity = 1 in
   def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
-    (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+    (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+    let SubtargetPredicate = inst.SubtargetPredicate; // Gate the pattern on the predicates of the pseudo it selects.
+    let OtherPredicates = inst.OtherPredicates;
+  }
 
-  def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+  def : FlatAtomicSaddrPat<inst_saddr, !cast<SDPatternOperator>(node),
                            GlobalSAddr, vt, data_vt> {
     let AddedComplexity = 9;
-    let SubtargetPredicate = HasFlatGVSMode;
+    let SubtargetPredicate = inst_saddr.SubtargetPredicate; // Taken from the _SADDR pseudo rather than a fixed predicate.
+    let OtherPredicates = inst_saddr.OtherPredicates;
   }
 }
 
@@ -1468,17 +1560,22 @@ multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
   FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
 
 
-multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
                                  ValueType data_vt = vt> {
-
+  defvar inst = !cast<FLAT_Pseudo>(inst_name#"_RTN"); // Pseudo selected by the $vaddr pattern below.
+  defvar inst_saddr = !cast<FLAT_Pseudo>(inst_name#"_SADDR_RTN"); // Pseudo selected by the FlatAtomicSaddrPat below.
   defvar rtnNode = !cast<SDPatternOperator>(node);
 
   def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
-    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+    (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+    let SubtargetPredicate = inst.SubtargetPredicate; // Gate the pattern on the predicates of the pseudo it selects.
+    let OtherPredicates = inst.OtherPredicates;
+  }
 
-  def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> {
+  def : FlatAtomicSaddrPat<inst_saddr, rtnNode, GlobalSAddrGLC, vt, data_vt> {
     let AddedComplexity = 8;
-    let SubtargetPredicate = HasFlatGVSMode;
+    let SubtargetPredicate = inst_saddr.SubtargetPredicate; // Taken from the _SADDR_RTN pseudo rather than a fixed predicate.
+    let OtherPredicates = inst_saddr.OtherPredicates;
   }
 }
 
@@ -1514,8 +1611,10 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
 class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt, ValueType data_vt = vt> : GCNPat <
   (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
-  (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
->;
+  (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+  let SubtargetPredicate = inst.SubtargetPredicate; // Gate the pattern on the same predicates as the pseudo it selects.
+  let OtherPredicates = inst.OtherPredicates; // Likewise copied from the selected pseudo.
+}
 
 multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
                                 ValueType data_vt = vt, int complexity = 0,
@@ -1592,6 +1691,16 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
   (inst $vaddr, $saddr, $offset, $cpol)
 >;
 
+multiclass GlobalLoadLDSPats_M0<FLAT_Pseudo inst, SDPatternOperator node> { // Emits one pattern for inst and one for its "_SADDR" sibling.
+  def : FlatLoadLDSSignedPat_M0 <inst, node> {
+    let AddedComplexity = 10;
+  }
+
+  def : GlobalLoadLDSSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { // Sibling pseudo is looked up by name: <inst>_SADDR.
+    let AddedComplexity = 11; // One higher than the pattern for inst above.
+  }
+}
+
 multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
   def : FlatLoadLDSSignedPat <inst, node> {
     let AddedComplexity = 10;
@@ -1615,20 +1724,42 @@ multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
 multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatLoadSignedPat <inst, node, vt> {
     let AddedComplexity = 10;
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
   }
 
   def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 11;
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
+  }
+}
+
+multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { // Emits two anonymous load patterns for `node`.
+  def : FlatLoadSignedPat_M0 <inst, node, vt> { // Pattern that targets `inst` itself.
+    let AddedComplexity = 10;
+    let SubtargetPredicate = inst.SubtargetPredicate; // Predicates are copied from the pseudo.
+    let OtherPredicates = inst.OtherPredicates;
+  }
+
+  def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { // Targets the record named "<inst>_SADDR".
+    let AddedComplexity = 11; // One higher than the pattern for `inst` above.
+    let SubtargetPredicate = inst.SubtargetPredicate; // Taken from `inst`, not from the _SADDR record.
+    let OtherPredicates = inst.OtherPredicates; // Likewise taken from `inst`.
   }
 }
 
 multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
   def : FlatLoadSignedPat_CPOL<inst, node, vt> {
     let AddedComplexity = 10;
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
   }
 
   def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 11;
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
   }
 }
 
@@ -1655,10 +1786,14 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
 multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
                                ValueType vt> {
   def : FlatStoreSignedPat <inst, node, vt> {
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
     let AddedComplexity = 10;
   }
 
   def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let SubtargetPredicate = inst.SubtargetPredicate;
+    let OtherPredicates = inst.OtherPredicates;
     let AddedComplexity = 11;
   }
 }
@@ -1803,7 +1938,9 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
 }
 
 multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
-  def : FlatLoadPat <inst, node, vt>;
+  def : FlatLoadPat <inst, node, vt> {
+    let OtherPredicates = [HasFlatAddressSpace];
+  }
 
   def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 9;
@@ -1830,7 +1967,9 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
 }
 
 multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
-  def : FlatStorePat <inst, node, vt>;
+  def : FlatStorePat <inst, node, vt> {
+    let OtherPredicates = [HasFlatAddressSpace];
+  }
 
   def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 9;
@@ -1847,8 +1986,6 @@ multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType
   }
 }
 
-let OtherPredicates = [HasFlatAddressSpace] in {
-
 defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
@@ -1898,6 +2035,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
 defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, atomic_load_nonext_128_flat, v4i32>;
 
 defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1922,6 +2060,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
 defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
 defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, atomic_store_128_flat, v4i32>;
 defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
 
@@ -1970,12 +2109,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
 defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
 defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
 
-} // End OtherPredicates = [HasFlatAddressSpace]
-
-let OtherPredicates = [isGFX12Plus] in
 defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
-let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
 defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
 
 let OtherPredicates = [HasD16LoadStore] in {
@@ -2000,8 +2134,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
 defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
 }
 
-let OtherPredicates = [HasFlatGlobalInsts] in {
-
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>;
@@ -2015,7 +2147,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
 
 foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
+let True16Predicate = p in {
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
@@ -2029,7 +2161,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
 }
 
-let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
@@ -2068,6 +2200,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, atomic_load_nonext_128_global, v4i32>;
 
 defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -2108,6 +2241,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, atomic_store_128_global, v4i32>;
 
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -2124,7 +2258,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
 defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
 
-let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
 defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
 
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
@@ -2144,7 +2278,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
 let SubtargetPredicate = isGFX12Plus in {
   defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
 
-  let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+  let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
     defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace  <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace",  i32>;
 }
 
@@ -2179,6 +2313,15 @@ let OtherPredicates = [isGFX125xOnly] in {
 } // End SubtargetPredicate = isGFX125xOnly
 
 let OtherPredicates = [isGFX1250Plus] in {
+  defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32,  int_amdgcn_cluster_load_b32,  i32>;
+  defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64,  int_amdgcn_cluster_load_b64,  v2i32>;
+  defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>;
+
+  defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B8,   int_amdgcn_cluster_load_async_to_lds_b8>;
+  defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B32,  int_amdgcn_cluster_load_async_to_lds_b32>;
+  defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B64,  int_amdgcn_cluster_load_async_to_lds_b64>;
+  defm : GlobalLoadLDSPats_M0 <CLUSTER_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_cluster_load_async_to_lds_b128>;
+
   defm : GlobalLoadLDSPats  <GLOBAL_LOAD_ASYNC_TO_LDS_B8,      int_amdgcn_global_load_async_to_lds_b8>;
   defm : GlobalLoadLDSPats  <GLOBAL_LOAD_ASYNC_TO_LDS_B32,     int_amdgcn_global_load_async_to_lds_b32>;
   defm : GlobalLoadLDSPats  <GLOBAL_LOAD_ASYNC_TO_LDS_B64,     int_amdgcn_global_load_async_to_lds_b64>;
@@ -2190,62 +2333,38 @@ let OtherPredicates = [isGFX1250Plus] in {
   defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
 }
 
-let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-}
-
-let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
 
-let OtherPredicates = [isGFX12Only] in {
-  // FIXME: Remove these intrinsics
-  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
-  defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
-  defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
-  defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+// FIXME: Remove these intrinsics
+let SubtargetPredicate = isGFX12Only in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
 }
 
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
 defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
 
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
 defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
 
-let OtherPredicates = [HasAtomicFaddRtnInsts] in {
 defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
 
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
 
-let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-}
 
-let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-}
 
-let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-}
-
-let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
-}
 
-let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
-} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
 
@@ -2566,6 +2685,7 @@ multiclass FLAT_Real_Atomics_vi <bits<7> op,
   defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
   def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+  def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
 }
 
 multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
@@ -2573,8 +2693,10 @@ multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
   FLAT_Real_AllAddr_vi<op, has_sccb> {
   def _RTN_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
   def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
-}
 
+  def _RTN_agpr_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+}
 
 defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
 defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41>;
@@ -3473,6 +3595,14 @@ defm GLOBAL_LOAD_MONITOR_B32          : VFLAT_Real_AllAddr_gfx1250<0x070>;
 defm GLOBAL_LOAD_MONITOR_B64          : VFLAT_Real_AllAddr_gfx1250<0x071>;
 defm GLOBAL_LOAD_MONITOR_B128         : VFLAT_Real_AllAddr_gfx1250<0x072>;
 
+defm CLUSTER_LOAD_B32                 : VFLAT_Real_AllAddr_gfx1250<0x067>;
+defm CLUSTER_LOAD_B64                 : VFLAT_Real_AllAddr_gfx1250<0x068>;
+defm CLUSTER_LOAD_B128                : VFLAT_Real_AllAddr_gfx1250<0x069>;
+
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B8     : VFLAT_Real_AllAddr_gfx1250<0x6a>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B32    : VFLAT_Real_AllAddr_gfx1250<0x6b>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B64    : VFLAT_Real_AllAddr_gfx1250<0x6c>;
+defm CLUSTER_LOAD_ASYNC_TO_LDS_B128   : VFLAT_Real_AllAddr_gfx1250<0x6d>;
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B8      : VFLAT_Real_AllAddr_gfx1250<0x5f>;
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B32     : VFLAT_Real_AllAddr_gfx1250<0x60>;
 defm GLOBAL_LOAD_ASYNC_TO_LDS_B64     : VFLAT_Real_AllAddr_gfx1250<0x61>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 184929a5a50f..8821558bb023 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -193,16 +193,6 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
   return &OldOpnd;
 }
 
-[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
-                               MachineRegisterInfo &MRI) {
-  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
-  if (RegClass == -1)
-    return 0;
-
-  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
-  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
-}
-
 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                            MachineInstr &MovMI,
                                            RegSubRegPair CombOldVGPR,
@@ -250,7 +240,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       ++NumOperands;
     }
     if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
-      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
+      if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::sdst)) {
         DPPInst.add(*SDst);
         ++NumOperands;
       }
@@ -295,12 +285,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
     assert(Src0);
-    int Src0Idx = NumOperands;
-    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
-      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
-      Fail = true;
-      break;
-    }
+    [[maybe_unused]] int Src0Idx = NumOperands;
+
     DPPInst.add(*Src0);
     DPPInst->getOperand(NumOperands).setIsKill(false);
     ++NumOperands;
@@ -319,21 +305,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
     if (Src1) {
-      int OpNum = NumOperands;
+      assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1) &&
+             "dpp version of instruction missing src1");
       // If subtarget does not support SGPRs for src1 operand then the
       // requirements are the same as for src0. We check src0 instead because
       // pseudos are shared between subtargets and allow SGPR for src1 on all.
       if (!ST->hasDPPSrc1SGPR()) {
-        assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
-                   getOperandSize(*DPPInst, NumOperands, *MRI) &&
+        assert(TII->getOpSize(*DPPInst, Src0Idx) ==
+                   TII->getOpSize(*DPPInst, NumOperands) &&
                "Src0 and Src1 operands should have the same size");
-        OpNum = Src0Idx;
-      }
-      if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
-        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
-        Fail = true;
-        break;
       }
+
       DPPInst.add(*Src1);
       ++NumOperands;
     }
@@ -349,9 +331,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     }
     auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
     if (Src2) {
-      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
-          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
-        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+      if (!AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: dpp does not have src2\n");
         Fail = true;
         break;
       }
@@ -431,6 +412,24 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
     DPPInst.addImm(CombBCZ ? 1 : 0);
+
+    constexpr AMDGPU::OpName Srcs[] = {
+        AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
+
+    // FIXME: isOperandLegal expects to operate on a completely built
+    // instruction. We should have better legality APIs to check if the
+    // candidate operands will be legal without building the instruction first.
+    for (auto [I, OpName] : enumerate(Srcs)) {
+      int OpIdx = AMDGPU::getNamedOperandIdx(DPPOp, OpName);
+      if (OpIdx == -1)
+        break;
+
+      if (!TII->isOperandLegal(*DPPInst, OpIdx)) {
+        LLVM_DEBUG(dbgs() << "  failed: src" << I << " operand is illegal\n");
+        Fail = true;
+        break;
+      }
+    }
   } while (false);
 
   if (Fail) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 931966b6df1d..7b94ea3ffbf1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -577,6 +577,7 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
 
   unsigned MaxNumVGPRs = MaxVectorRegs;
   unsigned MaxNumAGPRs = 0;
+  unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256;
 
   // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
   // a wave may have up to 512 total vector registers combining together both
@@ -589,7 +590,6 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
   if (hasGFX90AInsts()) {
     unsigned MinNumAGPRs = 0;
     const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
-    const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
 
     const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
 
@@ -614,11 +614,11 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
     MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
     MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
 
-    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
     MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
 
     assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
-           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
            "invalid register counts");
   } else if (hasMAIInsts()) {
     // On gfx908 the number of AGPRs always equals the number of VGPRs.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2a8385df3f93..cbd6f64976d2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -198,6 +198,7 @@ protected:
   bool DynamicVGPR = false;
   bool DynamicVGPRBlockSize32 = false;
   bool HasVMemToLDSLoad = false;
+  bool RequiresAlignVGPR = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
@@ -235,6 +236,7 @@ protected:
   bool HasPseudoScalarTrans = false;
   bool HasRestrictedSOffset = false;
   bool Has64BitLiterals = false;
+  bool Has1024AddressableVGPRs = false;
   bool HasBitOp3Insts = false;
   bool HasTanhInsts = false;
   bool HasTensorCvtLutInsts = false;
@@ -250,7 +252,6 @@ protected:
   bool HasVmemPrefInsts = false;
   bool HasSafeSmemPrefetch = false;
   bool HasSafeCUPrefetch = false;
-  bool HasCUStores = false;
   bool HasVcmpxExecWARHazard = false;
   bool HasLdsBranchVmemWARHazard = false;
   bool HasNSAtoVMEMBug = false;
@@ -1015,8 +1016,6 @@ public:
 
   bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
 
-  bool hasCUStores() const { return HasCUStores; }
-
   // Has s_cmpk_* instructions.
   bool hasSCmpK() const { return getGeneration() < GFX12; }
 
@@ -1350,7 +1349,7 @@ public:
   }
 
   /// Return if operations acting on VGPR tuples require even alignment.
-  bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
+  bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
 
   /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
   bool hasSPackHL() const { return GFX11Insts; }
@@ -1436,6 +1435,8 @@ public:
 
   bool hasAddPC64Inst() const { return GFX1250Insts; }
 
+  bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
+
   bool hasMinimum3Maximum3PKF16() const {
     return HasMinimum3Maximum3PKF16;
   }
@@ -1831,6 +1832,13 @@ public:
   bool hasScratchBaseForwardingHazard() const {
     return GFX1250Insts && getGeneration() == GFX12;
   }
+
+  /// \returns true if the subtarget supports clusters of workgroups.
+  bool hasClusters() const { return GFX1250Insts; }
+
+  /// \returns true if the subtarget requires a wait for xcnt before atomic
+  /// flat/global stores & rmw.
+  bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index aafbdc2e86a9..f098e7a3c6c6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -80,12 +80,9 @@ void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo,
                                             const MCSubtargetInfo &STI,
                                             raw_ostream &O) {
   // KIMM64
-  // This part needs to align with AMDGPUInstPrinter::printImmediate64.
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
   uint64_t Imm = MI->getOperand(OpNo).getImm();
-  if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm))
-    O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
-  else
-    O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
+  printLiteral64(Desc, Imm, STI, O, /*IsFP=*/true);
 }
 
 void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
@@ -327,6 +324,54 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI,
   }
 }
 
+// \returns \p Reg unchanged when its hardware register index is below 256;
+// otherwise register (index % 256) of the class from getVGPRPhysRegClass().
+static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+  unsigned Enc = MRI.getEncodingValue(Reg);
+  unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+  if (Idx < 0x100) // Index is already within the first 256 registers.
+    return Reg;
+
+  const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+  return RC->getRegister(Idx % 0x100); // NOTE(review): assumes RC is non-null.
+}
+
+// Restore the MSBs of VGPR operand \p OpNo from \p MIA; else \returns \p Reg.
+static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo,
+                               const MCInstrDesc &Desc,
+                               const MCRegisterInfo &MRI,
+                               const AMDGPUMCInstrAnalysis &MIA) {
+  unsigned VgprMSBs = MIA.getVgprMSBs();
+  if (!VgprMSBs) // All recorded MSBs are zero: nothing to restore.
+    return Reg;
+
+  unsigned Enc = MRI.getEncodingValue(Reg);
+  if (!(Enc & AMDGPU::HWEncoding::IS_VGPR)) // Only VGPRs are remapped.
+    return Reg;
+
+  auto Ops = AMDGPU::getVGPRLoweringOperandTables(Desc);
+  if (!Ops.first) // No operand-name table for this instruction.
+    return Reg;
+  unsigned Opc = Desc.getOpcode();
+  unsigned I;
+  for (I = 0; I < 4; ++I) { // Find the table slot whose operand is OpNo.
+    if (Ops.first[I] != AMDGPU::OpName::NUM_OPERAND_NAMES &&
+        (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.first[I]) == OpNo)
+      break;
+    if (Ops.second && Ops.second[I] != AMDGPU::OpName::NUM_OPERAND_NAMES &&
+        (unsigned)AMDGPU::getNamedOperandIdx(Opc, Ops.second[I]) == OpNo)
+      break;
+  }
+  if (I == 4) // OpNo is not named in either table.
+    return Reg;
+  unsigned OpMSBs = (VgprMSBs >> (I * 2)) & 3; // Two bits per table slot.
+  if (!OpMSBs) // Zero MSBs for this slot: keep Reg.
+    return Reg;
+  if (MCRegister NewReg = AMDGPU::getVGPRWithMSBs(Reg, OpMSBs, MRI))
+    return NewReg;
+  return Reg; // getVGPRWithMSBs produced no register; keep Reg.
+}
+
 void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
                                         const MCRegisterInfo &MRI) {
 #if !defined(NDEBUG)
@@ -340,7 +385,20 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O,
   }
 #endif
 
-  O << getRegisterName(Reg);
+  unsigned PrintReg = getRegForPrinting(Reg, MRI);
+  O << getRegisterName(PrintReg);
+
+  if (PrintReg != Reg.id())
+    O << " /*" << getRegisterName(Reg) << "*/";
+}
+
+void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, unsigned Opc,
+                                        unsigned OpNo, raw_ostream &O,
+                                        const MCRegisterInfo &MRI) {
+  if (MIA) // Without an analysis object, Reg is printed as given.
+    Reg = getRegFromMIA(Reg, OpNo, MII.get(Opc), MRI,
+                        *static_cast<const AMDGPUMCInstrAnalysis *>(MIA));
+  printRegOperand(Reg, O, MRI); // Defer to the opcode-agnostic overload.
 }
 
 void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -594,7 +652,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
   O << formatHex(static_cast<uint64_t>(Imm));
 }
 
-void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+void AMDGPUInstPrinter::printImmediate64(const MCInstrDesc &Desc, uint64_t Imm,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O, bool IsFP) {
   int64_t SImm = static_cast<int64_t>(Imm);
@@ -624,18 +682,24 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
   else if (Imm == 0x3fc45f306dc9c882 &&
            STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
     O << "0.15915494309189532";
-  else {
-    // This part needs to align with AMDGPUOperand::addLiteralImmOperand.
-    if (IsFP) {
-      if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm))
-        O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
-      else
-        O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
-      return;
-    }
+  else
+    printLiteral64(Desc, Imm, STI, O, IsFP);
+}
 
-    if (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-        (!isInt<32>(Imm) || !isUInt<32>(Imm)))
+void AMDGPUInstPrinter::printLiteral64(const MCInstrDesc &Desc, uint64_t Imm,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O, bool IsFP) {
+  // This part needs to align with AMDGPUOperand::addLiteralImmOperand.
+  bool CanUse64BitLiterals =
+      STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+      !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
+  if (IsFP) {
+    if (CanUse64BitLiterals && Lo_32(Imm))
+      O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
+    else
+      O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
+  } else {
+    if (CanUse64BitLiterals && (!isInt<32>(Imm) || !isUInt<32>(Imm)))
       O << "lit64(" << formatHex(static_cast<uint64_t>(Imm)) << ')';
     else
       O << formatHex(static_cast<uint64_t>(Imm));
@@ -719,7 +783,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
 
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
-    printRegOperand(Op.getReg(), O, MRI);
+    printRegOperand(Op.getReg(), MI->getOpcode(), OpNo, O, MRI);
 
     // Check if operand register class contains register used.
     // Intention: print disassembler message when invalid code is decoded,
@@ -750,12 +814,12 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       break;
     case AMDGPU::OPERAND_REG_IMM_INT64:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
-      printImmediate64(Op.getImm(), STI, O, false);
+      printImmediate64(Desc, Op.getImm(), STI, O, false);
       break;
     case AMDGPU::OPERAND_REG_IMM_FP64:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-      printImmediate64(Op.getImm(), STI, O, true);
+      printImmediate64(Desc, Op.getImm(), STI, O, true);
       break;
     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
     case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -793,22 +857,6 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
       // custom printer.
       llvm_unreachable("unexpected immediate operand type");
     }
-  } else if (Op.isDFPImm()) {
-    double Value = bit_cast<double>(Op.getDFPImm());
-    // We special case 0.0 because otherwise it will be printed as an integer.
-    if (Value == 0.0)
-      O << "0.0";
-    else {
-      const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-      int RCID = Desc.operands()[OpNo].RegClass;
-      unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
-      if (RCBits == 32)
-        printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
-      else if (RCBits == 64)
-        printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
-      else
-        llvm_unreachable("Invalid register class size");
-    }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
     MAI.printExpr(O, *Exp);
@@ -891,7 +939,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
     if (OpNo + 1 < MI->getNumOperands() &&
         (InputModifiers & SISrcMods::ABS) == 0) {
       const MCOperand &Op = MI->getOperand(OpNo + 1);
-      NegMnemo = Op.isImm() || Op.isDFPImm();
+      NegMnemo = Op.isImm();
     }
     if (NegMnemo) {
       O << "neg(";
@@ -1146,7 +1194,7 @@ void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
     OpNo = OpNo - N + N / 2;
 
   if (En & (1 << N))
-    printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+    printRegOperand(MI->getOperand(OpNo).getReg(), Opc, OpNo, O, MRI);
   else
     O << "off";
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index be32061c6453..21cc2f229de9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -35,6 +35,8 @@ public:
                  const MCSubtargetInfo &STI, raw_ostream &O) override;
   static void printRegOperand(MCRegister Reg, raw_ostream &O,
                               const MCRegisterInfo &MRI);
+  void printRegOperand(MCRegister Reg, unsigned Opc, unsigned OpNo,
+                       raw_ostream &O, const MCRegisterInfo &MRI);
 
 private:
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
@@ -70,7 +72,7 @@ private:
   void printSymbolicFormat(const MCInst *MI,
                            const MCSubtargetInfo &STI, raw_ostream &O);
 
-  void printRegOperand(unsigned RegNo, raw_ostream &O);
+  void printRegOperand(MCRegister Reg, raw_ostream &O);
   void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                    raw_ostream &O);
   void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -87,8 +89,10 @@ private:
                              raw_ostream &O);
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
-  void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O, bool IsFP);
+  void printImmediate64(const MCInstrDesc &Desc, uint64_t Imm,
+                        const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP);
+  void printLiteral64(const MCInstrDesc &Desc, uint64_t Imm,
+                      const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP);
   void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                     raw_ostream &O);
   void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 61f673221739..fd65f95334f7 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -88,7 +88,7 @@ private:
 
   /// Encode an fp or int literal.
   std::optional<uint64_t>
-  getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
+  getLitEncoding(const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo,
                  const MCSubtargetInfo &STI,
                  bool HasMandatoryLiteral = false) const;
 
@@ -219,8 +219,8 @@ static uint32_t getLit16IntEncoding(uint32_t Val, const MCSubtargetInfo &STI) {
   return getLit32Encoding(Val, STI);
 }
 
-static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI,
-                                 bool IsFP) {
+static uint32_t getLit64Encoding(const MCInstrDesc &Desc, uint64_t Val,
+                                 const MCSubtargetInfo &STI, bool IsFP) {
   uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
   if (IntImm != 0)
     return IntImm;
@@ -253,29 +253,27 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI,
       STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
     return 248;
 
-  // The rest part needs to align with AMDGPUInstPrinter::printImmediate64.
+  // The rest part needs to align with AMDGPUInstPrinter::printLiteral64.
 
+  bool CanUse64BitLiterals =
+      STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
+      !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
   if (IsFP) {
-    return STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Val) ? 254
-                                                                      : 255;
+    return CanUse64BitLiterals && Lo_32(Val) ? 254 : 255;
   }
 
-  return STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-                 (!isInt<32>(Val) || !isUInt<32>(Val))
-             ? 254
-             : 255;
+  return CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val)) ? 254
+                                                                      : 255;
 }
 
 std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
-    const MCOperand &MO, const MCOperandInfo &OpInfo,
+    const MCInstrDesc &Desc, const MCOperand &MO, unsigned OpNo,
     const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const {
+  const MCOperandInfo &OpInfo = Desc.operands()[OpNo];
   int64_t Imm;
   if (MO.isExpr()) {
     if (!MO.getExpr()->evaluateAsAbsolute(Imm))
-      return (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-              OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64)
-                 ? 254
-                 : 255;
+      return AMDGPU::getOperandSize(OpInfo) == 8 ? 254 : 255;
   } else {
     assert(!MO.isDFPImm());
 
@@ -299,14 +297,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding(
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
-    return getLit64Encoding(static_cast<uint64_t>(Imm), STI, false);
+    return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, false);
 
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
-    return getLit64Encoding(static_cast<uint64_t>(Imm), STI, true);
+    return getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true);
 
   case AMDGPU::OPERAND_REG_IMM_FP64: {
-    auto Enc = getLit64Encoding(static_cast<uint64_t>(Imm), STI, true);
+    auto Enc = getLit64Encoding(Desc, static_cast<uint64_t>(Imm), STI, true);
     return (HasMandatoryLiteral && Enc == 255) ? 254 : Enc;
   }
 
@@ -405,7 +403,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
   if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
     assert((Encoding & 0xFF) == 0);
     Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
-                AMDGPU::HWEncoding::REG_IDX_MASK;
+                AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
   }
 
   for (unsigned i = 0; i < bytes; i++) {
@@ -447,7 +445,7 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
 
     // Is this operand a literal immediate?
     const MCOperand &Op = MI.getOperand(i);
-    auto Enc = getLitEncoding(Op, Desc.operands()[i], STI);
+    auto Enc = getLitEncoding(Desc, Op, i, STI);
     if (!Enc || (*Enc != 255 && *Enc != 254))
       continue;
 
@@ -521,7 +519,7 @@ void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
     return;
   } else {
     const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
-    auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI);
+    auto Enc = getLitEncoding(Desc, MO, OpNo, STI);
     if (Enc && *Enc != 255) {
       Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK;
       return;
@@ -554,7 +552,7 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding(
     SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
   MCRegister Reg = MI.getOperand(OpNo).getReg();
   unsigned Enc = MRI.getEncodingValue(Reg);
-  unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+  unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
   bool IsVGPROrAGPR =
       Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
 
@@ -596,7 +594,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                             const MCSubtargetInfo &STI) const {
   if (MO.isReg()){
     unsigned Enc = MRI.getEncodingValue(MO.getReg());
-    unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+    unsigned Idx = Enc & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
     bool IsVGPROrAGPR =
         Enc & (AMDGPU::HWEncoding::IS_VGPR | AMDGPU::HWEncoding::IS_AGPR);
     Op = Idx | (IsVGPROrAGPR << 8);
@@ -659,7 +657,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isReg()) {
     uint16_t Encoding = MRI.getEncodingValue(MO.getReg());
-    unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK;
+    unsigned RegIdx = Encoding & AMDGPU::HWEncoding::LO256_REG_IDX_MASK;
     bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI16;
     bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR;
     assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!");
@@ -695,11 +693,8 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
     const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
     uint32_t Offset = Desc.getSize();
     assert(Offset == 4 || Offset == 8);
-    auto OpType = Desc.operands()[OpNo].OperandType;
-    MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-                        OpType == AMDGPU::OPERAND_REG_IMM_INT64)
-                           ? FK_Data_8
-                           : FK_Data_4;
+    unsigned Size = AMDGPU::getOperandSize(Desc, OpNo);
+    MCFixupKind Kind = MCFixup::getDataKindForSize(Size);
     addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel);
   }
 
@@ -707,8 +702,7 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
   if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
     bool HasMandatoryLiteral =
         AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm);
-    if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI,
-                                  HasMandatoryLiteral)) {
+    if (auto Enc = getLitEncoding(Desc, MO, OpNo, STI, HasMandatoryLiteral)) {
       Op = *Enc;
       return;
     }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index d66725d3a6c4..90c56f690146 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -21,9 +21,9 @@
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
@@ -130,31 +130,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
                                  std::move(Emitter));
 }
 
-namespace {
-
-class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
-public:
-  explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
-      : MCInstrAnalysis(Info) {}
-
-  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
-                      uint64_t &Target) const override {
-    if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
-        Info->get(Inst.getOpcode()).operands()[0].OperandType !=
-            MCOI::OPERAND_PCREL)
-      return false;
+namespace llvm {
+namespace AMDGPU {
+
+bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                                           uint64_t Size,
+                                           uint64_t &Target) const {
+  if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
+      Info->get(Inst.getOpcode()).operands()[0].OperandType !=
+          MCOI::OPERAND_PCREL)
+    return false; // Operand 0 is not a PC-relative immediate.
+
+  int64_t Imm = Inst.getOperand(0).getImm();
+  // Our branches take a simm16, scaled by 4 and relative to Addr + Size.
+  Target = SignExtend64<16>(Imm) * 4 + Addr + Size;
+  return true;
+}
 
-    int64_t Imm = Inst.getOperand(0).getImm();
-    // Our branches take a simm16.
-    Target = SignExtend64<16>(Imm) * 4 + Addr + Size;
-    return true;
-  }
-};
+void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) {
+  if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12)
+    VgprMSBs = Inst.getOperand(0).getImm(); // Record the new immediate.
+  else if (isTerminator(Inst))
+    VgprMSBs = 0; // The recorded value is dropped at a terminator.
+}
 
-} // end anonymous namespace
+} // end namespace AMDGPU
+} // end namespace llvm
 
 static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
-  return new AMDGPUMCInstrAnalysis(Info);
+  return new AMDGPU::AMDGPUMCInstrAnalysis(Info);
 }
 
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 9c0b2da0fcb0..986388414096 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
 
+#include "llvm/MC/MCInstrAnalysis.h"
 #include <cstdint>
 #include <memory>
 
@@ -44,6 +45,28 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
 std::unique_ptr<MCObjectTargetWriter>
 createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
                             bool HasRelocationAddend);
+
+namespace AMDGPU {
+class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
+private:
+  unsigned VgprMSBs = 0; // Set by updateState(), cleared by resetState().
+
+public:
+  explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
+      : MCInstrAnalysis(Info) {}
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override;
+
+  void resetState() override { VgprMSBs = 0; }
+
+  void updateState(const MCInst &Inst, uint64_t Addr) override;
+
+  unsigned getVgprMSBs() const { return VgprMSBs; }
+};
+
+} // namespace AMDGPU
+
 } // namespace llvm
 
 #define GET_REGINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 0bbab29dbda1..ff6a21239345 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
       amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
       amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
       ".amdhsa_user_sgpr_private_segment_size");
-  if (isGFX1250(STI))
-    PrintField(KD.kernel_code_properties,
-               amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
-               amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
-               ".amdhsa_uses_cu_stores");
   if (IVersion.Major >= 10)
     PrintField(KD.kernel_code_properties,
                amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index ff5321df6452..bf787b230067 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -420,7 +420,7 @@ class VSAMPLE_gfx12<int op, dag outs, int num_addrs, string dns="",
 }
 
 class MIMG_NoSampler_Helper <mimgopc op, string asm,
-                             RegisterClass dst_rc,
+                             RegisterOperand dst_rc,
                              RegisterClass addr_rc,
                              string dns="">
   : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> {
@@ -433,10 +433,10 @@ class MIMG_NoSampler_Helper <mimgopc op, string asm,
 }
 
 class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
-                                    RegisterClass dst_rc,
+                                    RegisterOperand dst_rc,
                                     RegisterClass addr_rc,
                                     string dns="">
-  : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+  : MIMG_gfx90a <op.GFX10M, (outs getAlign2RegOp<dst_rc>.ret:$vdata), dns> {
   let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
@@ -446,7 +446,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
 }
 
 class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
-                           RegisterClass DataRC, RegisterClass AddrRC,
+                           RegisterOperand DataRC, RegisterClass AddrRC,
                            string dns="">
   : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
   let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -458,7 +458,7 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
-                               RegisterClass DataRC, int num_addrs,
+                               RegisterOperand DataRC, int num_addrs,
                                string dns="">
   : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
@@ -471,7 +471,7 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
-                           RegisterClass DataRC, RegisterClass AddrRC,
+                           RegisterOperand DataRC, RegisterClass AddrRC,
                            string dns="">
   : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
   let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
@@ -483,7 +483,7 @@ class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
 }
 
 class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
-                               RegisterClass DataRC, int num_addrs,
+                               RegisterOperand DataRC, int num_addrs,
                                string dns="">
   : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
@@ -496,7 +496,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
 }
 
 class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
-                             RegisterClass DataRC, int num_addrs,
+                             RegisterOperand DataRC, int num_addrs,
                              string dns="">
   : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
@@ -507,7 +507,7 @@ class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
                     #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
-class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
                             int num_addrs, RegisterClass Addr3RC = VGPR_32,
                             string dns="">
   : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> {
@@ -544,7 +544,7 @@ class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode,
 }
 
 multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
-                                      RegisterClass dst_rc, bit enableDisasm,
+                                      RegisterOperand dst_rc, bit enableDisasm,
                                       bit ExtendedImageInst = 1,
                                       bit isVSample = 0> {
   let VAddrDwords = 1 in {
@@ -578,7 +578,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
       if op.HAS_GFX10M then {
         def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
         if !not(ExtendedImageInst) then
-        def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
+        def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64_Align2>;
         def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
         def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
       }
@@ -602,7 +602,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
       if op.HAS_GFX10M then {
         def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
         if !not(ExtendedImageInst) then
-        def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
+        def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96_Align2>;
         def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
         def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
       }
@@ -626,7 +626,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
       if op.HAS_GFX10M then {
         def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
         if !not(ExtendedImageInst) then
-        def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
+        def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128_Align2>;
         def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
         def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
                                                      !if(enableDisasm, "GFX10", "")>;
@@ -664,20 +664,20 @@ multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
       mayLoad = !not(isResInfo) in {
     let VDataDwords = 1 in
-    defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
+    defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_32, 1, msaa>;
     let VDataDwords = 2 in
-    defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
+    defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_64, 0, msaa>;
     let VDataDwords = 3 in
-    defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
+    defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_96, 0, msaa>;
     let VDataDwords = 4 in
-    defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
+    defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_128, 0, msaa>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
+    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, AVLdSt_160, 0, msaa>;
   }
 }
 
 class MIMG_Store_Helper <mimgopc op, string asm,
-                         RegisterClass data_rc,
+                         RegisterOperand data_rc,
                          RegisterClass addr_rc,
                          string dns = "">
   : MIMG_gfx6789<op.GFX10M, (outs), dns> {
@@ -690,11 +690,11 @@ class MIMG_Store_Helper <mimgopc op, string asm,
 }
 
 class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
-                                RegisterClass data_rc,
+                                RegisterOperand data_rc,
                                 RegisterClass addr_rc,
                                 string dns = "">
   : MIMG_gfx90a<op.GFX10M, (outs), dns> {
-  let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+  let InOperandList = !con((ins getAlign2RegOp<data_rc>.ret:$vdata,
                                 addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
@@ -704,7 +704,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
 }
 
 class MIMG_Store_gfx10<mimgopc op, string opcode,
-                       RegisterClass DataRC, RegisterClass AddrRC,
+                       RegisterOperand DataRC, RegisterClass AddrRC,
                        string dns="">
   : MIMG_gfx10<op.GFX10M, (outs), dns> {
   let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
@@ -716,7 +716,7 @@ class MIMG_Store_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
-                           RegisterClass DataRC, int num_addrs,
+                           RegisterOperand DataRC, int num_addrs,
                            string dns="">
   : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
@@ -730,7 +730,7 @@ class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Store_gfx11<mimgopc op, string opcode,
-                       RegisterClass DataRC, RegisterClass AddrRC,
+                       RegisterOperand DataRC, RegisterClass AddrRC,
                        string dns="">
   : MIMG_gfx11<op.GFX11, (outs), dns> {
   let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
@@ -742,7 +742,7 @@ class MIMG_Store_gfx11<mimgopc op, string opcode,
 }
 
 class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
-                           RegisterClass DataRC, int num_addrs,
+                           RegisterOperand DataRC, int num_addrs,
                            string dns="">
   : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
@@ -756,7 +756,7 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
 }
 
 class VIMAGE_Store_gfx12<mimgopc op, string opcode,
-                         RegisterClass DataRC, int num_addrs,
+                         RegisterOperand DataRC, int num_addrs,
                          string dns="">
   : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
@@ -769,7 +769,7 @@ class VIMAGE_Store_gfx12<mimgopc op, string opcode,
 }
 
 multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
-                                  RegisterClass data_rc,
+                                  RegisterOperand data_rc,
                                   bit enableDisasm> {
   let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
       DisableWQM = 1 in {
@@ -797,7 +797,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
       let ssamp = 0 in {
         if op.HAS_GFX10M then {
           def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
-          def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
+          def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64_Align2>;
           def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
           def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
         }
@@ -814,7 +814,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
       let ssamp = 0 in {
         if op.HAS_GFX10M then {
           def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
-          def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
+          def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96_Align2>;
           def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
           def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
         }
@@ -831,7 +831,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
       let ssamp = 0 in {
         if op.HAS_GFX10M then {
           def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
-          def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
+          def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128_Align2>;
           def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
           def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
                                                           !if(enableDisasm, "GFX10", "")>;
@@ -860,19 +860,19 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
     let VDataDwords = 1 in
-    defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+    defm _V1 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_32, 1>;
     let VDataDwords = 2 in
-    defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>;
+    defm _V2 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_64, 0>;
     let VDataDwords = 3 in
-    defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>;
+    defm _V3 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_96, 0>;
     let VDataDwords = 4 in
-    defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>;
+    defm _V4 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_128, 0>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_Store_Addr_Helper <op, asm, VReg_160, 0>;
+    defm _V5 : MIMG_Store_Addr_Helper <op, asm, AVLdSt_160, 0>;
   }
 }
 
-class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc,
                                 RegisterClass addr_rc, string dns="">
   : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
   let Constraints = "$vdst = $vdata";
@@ -883,33 +883,33 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
 }
 
-class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc,
                                RegisterClass addr_rc, string dns="">
-  : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
+  : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> {
   let Constraints = "$vdst = $vdata";
 
-  let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+  let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata,
                            addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, LWE:$lwe, DA:$da);
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
 }
 
-class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc,
                      RegisterClass addr_rc, bit enableDasm = 0>
   : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
                              !if(enableDasm, "GFX6GFX7", "")> {
   let AssemblerPredicate = isGFX6GFX7;
 }
 
-class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc,
                      RegisterClass addr_rc, bit enableDasm = 0>
   : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
   let AssemblerPredicate = isGFX8GFX9NotGFX90A;
   let MIMGEncoding = MIMGEncGfx8;
 }
 
-class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc,
                          RegisterClass addr_rc, bit enableDasm = 0>
   : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
   let AssemblerPredicate = isGFX90APlus;
@@ -917,7 +917,7 @@ class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
 }
 
 class MIMG_Atomic_gfx10<mimgopc op, string opcode,
-                        RegisterClass DataRC, RegisterClass AddrRC,
+                        RegisterOperand DataRC, RegisterClass AddrRC,
                         bit enableDisasm = 0>
   : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst),
                !if(enableDisasm, "GFX10", "")> {
@@ -930,7 +930,7 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
-                            RegisterClass DataRC, int num_addrs,
+                            RegisterOperand DataRC, int num_addrs,
                             bit enableDisasm = 0>
   : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs,
                    !if(enableDisasm, "GFX10", "")> {
@@ -945,7 +945,7 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Atomic_gfx11<mimgopc op, string opcode,
-                        RegisterClass DataRC, RegisterClass AddrRC,
+                        RegisterOperand DataRC, RegisterClass AddrRC,
                         bit enableDisasm = 0>
   : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst),
                !if(enableDisasm, "GFX11", "")> {
@@ -958,7 +958,7 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode,
 }
 
 class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
-                            RegisterClass DataRC, int num_addrs,
+                            RegisterOperand DataRC, int num_addrs,
                             bit enableDisasm = 0>
   : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs,
                    !if(enableDisasm, "GFX11", "")> {
@@ -972,7 +972,7 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
   let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
 }
 
-class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC,
                           int num_addrs, string renamed, bit enableDisasm = 0>
   : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs,
                   !if(enableDisasm, "GFX12", "")> {
@@ -987,7 +987,7 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
 }
 
 multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
-                                      RegisterClass data_rc,
+                                      RegisterOperand data_rc,
                                       bit enableDasm = 0,
                                       bit isFP = 0,
                                       string renamed = ""> {
@@ -1022,7 +1022,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         }
         if op.HAS_VI then {
           def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
-          def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
+          def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>;
         }
         if op.HAS_GFX10M then {
           def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
@@ -1044,7 +1044,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         }
         if op.HAS_VI then {
           def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
-          def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
+          def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>;
         }
         if op.HAS_GFX10M then {
           def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
@@ -1066,7 +1066,7 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
         }
         if op.HAS_VI then {
           def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
-          def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+          def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>;
         }
         if op.HAS_GFX10M then {
           def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
@@ -1105,19 +1105,19 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
       // Other variants are reconstructed by disassembler using dmask and tfe.
       if !not(isCmpSwap) then {
         let VDataDwords = 1 in
-        defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, VGPR_32, 1, isFP, renamed>;
+        defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>;
       }
 
       let VDataDwords = 2 in
-      defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_64, isCmpSwap, isFP, renamed>;
+      defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>;
       let VDataDwords = 3 in
-      defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_96, 0, isFP, renamed>;
+      defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>;
 
       if isCmpSwap then {
         let VDataDwords = 4 in
-        defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_128, 0, isFP, renamed>;
+        defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>;
         let VDataDwords = 5 in
-        defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, VReg_160, 0, isFP, renamed>;
+        defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>;
       }
     }
   } // End IsAtomicRet = 1
@@ -1127,7 +1127,7 @@ multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
                                 bit isCmpSwap = 0, bit isFP = 0>
   : MIMG_Atomic <op, asm, isCmpSwap, isFP, renamed>;
 
-class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterOperand dst_rc,
                            RegisterClass src_rc, string dns="">
   : MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> {
   let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
@@ -1138,9 +1138,9 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
                       #!if(BaseOpcode.HasD16, "$d16", "");
 }
 
-class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterOperand dst_rc,
                           RegisterClass src_rc, string dns="">
-  : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+  : MIMG_gfx90a<op.GFX10M, (outs dst_rc:$vdata), dns> {
   let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
@@ -1164,7 +1164,7 @@ class MIMG_Sampler_Asm_gfx10p<string opcode, string AsmPrefix, bit HasD16> {
 }
 
 class MIMG_Sampler_gfx10<mimgopc op, string opcode,
-                         RegisterClass DataRC, RegisterClass AddrRC,
+                         RegisterOperand DataRC, RegisterClass AddrRC,
                          string dns="">
   : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
   let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
@@ -1172,7 +1172,7 @@ class MIMG_Sampler_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
-                             RegisterClass DataRC, int num_addrs,
+                             RegisterOperand DataRC, int num_addrs,
                              string dns="">
   : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = MIMG_Sampler_OpList_gfx10p<AddrIns, BaseOpcode.HasD16>.ret;
@@ -1200,7 +1200,7 @@ class MIMG_Sampler_nortn_nsa_gfx10<mimgopc op, string opcode,
 }
 
 class MIMG_Sampler_gfx11<mimgopc op, string opcode,
-                         RegisterClass DataRC, RegisterClass AddrRC,
+                         RegisterOperand DataRC, RegisterClass AddrRC,
                          string dns="">
   : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
   let InOperandList = MIMG_Sampler_OpList_gfx10p<(ins AddrRC:$vaddr0), BaseOpcode.HasD16>.ret;
@@ -1208,7 +1208,7 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode,
 }
 
 class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
-                             RegisterClass DataRC, int num_addrs,
+                             RegisterOperand DataRC, int num_addrs,
                              RegisterClass LastVAddrSize, string dns="">
   : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [],
                    LastVAddrSize> {
@@ -1345,7 +1345,7 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
 }
 
 multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
-                                    AMDGPUSampleVariant sample, RegisterClass dst_rc,
+                                    AMDGPUSampleVariant sample, RegisterOperand dst_rc,
                                     bit enableDisasm = 0,
                                     bit ExtendedImageInst = 1, bit isG16 = 0> {
   foreach addr = MIMG_Sampler_AddrSizes<sample, isG16>.MachineInstrs in {
@@ -1473,15 +1473,15 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSamp
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
       mayLoad = !not(isGetLod) in {
     let VDataDwords = 1 in
-    defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst, isG16>;
+    defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_32, 1, ExtendedImageInst, isG16>;
     let VDataDwords = 2 in
-    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst, isG16>;
+    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, 0, ExtendedImageInst, isG16>;
     let VDataDwords = 3 in
-    defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst, isG16>;
+    defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_96, 0, ExtendedImageInst, isG16>;
     let VDataDwords = 4 in
-    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst, isG16>;
+    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128, 0, ExtendedImageInst, isG16>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst, isG16>;
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160, 0, ExtendedImageInst, isG16>;
   }
 
   if !not(isGetLod) then
@@ -1501,11 +1501,11 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
       Gather4 = 1 in {
     let VDataDwords = 2 in
-    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, /*enableDisasm*/ true>; /* for packed D16 only */
+    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_64, /*enableDisasm*/ true>; /* for packed D16 only */
     let VDataDwords = 4 in
-    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_128>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, AVLdSt_160>;
   }
 }
 
@@ -1632,13 +1632,13 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> {
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
     Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in {
     let VDataDwords = 2 in
-    defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0, 0, 1>; /* packed D16 */
+    defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_64, 0, 0, 1>; /* packed D16 */
     let VDataDwords = 3 in
-    defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0, 0, 1>; /* packed D16 + tfe */
+    defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_96, 0, 0, 1>; /* packed D16 + tfe */
     let VDataDwords = 4 in
-    defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1, 0, 1>;
+    defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_128, 1, 0, 1>;
     let VDataDwords = 5 in
-    defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0, 0, 1>;
+    defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, AVLdSt_160, 0, 0, 1>;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 8d27153fcfcd..3e256cce97af 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
 #include "R600GenInstrInfo.inc"
 
 R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
-  : R600GenInstrInfo(-1, -1), RI(), ST(ST) {}
+    : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {}
 
 bool R600InstrInfo::isVector(const MachineInstr &MI) const {
   return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index f82bd55beccc..dda0cf6a3218 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -123,7 +123,6 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
   let HasNativeOperands = 1;
   let Op1 = 1;
   let ALUInst = 1;
-  let DisableEncoding = "$literal";
   let UseNamedOperandTable = 1;
 
   let Inst{31-0}  = Word0;
@@ -161,7 +160,6 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
   let HasNativeOperands = 1;
   let Op2 = 1;
   let ALUInst = 1;
-  let DisableEncoding = "$literal";
   let UseNamedOperandTable = 1;
 
   let Inst{31-0}  = Word0;
@@ -201,7 +199,6 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
     R600ALU_Word1_OP3<inst>{
 
   let HasNativeOperands = 1;
-  let DisableEncoding = "$literal";
   let Op3 = 1;
   let UseNamedOperandTable = 1;
   let ALUInst = 1;
@@ -1783,7 +1780,7 @@ def : DwordAddrPat  <i32, R600_Reg32>;
 def getLDSNoRetOp : InstrMapping {
   let FilterClass = "R600_LDS_1A1D";
   let RowFields = ["BaseOp"];
-  let ColFields = ["DisableEncoding"];
-  let KeyCol = ["$dst"];
-  let ValueCols = [[""""]];
+  let ColFields = ["usesCustomInserter"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
 }
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 268b153c6c92..ecc4659ee0e8 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -237,16 +237,16 @@ enum OperandType : unsigned {
   OPERAND_REG_INLINE_AC_FP32,
   OPERAND_REG_INLINE_AC_FP64,
 
+  // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline
+  // constants. Does not accept registers.
+  OPERAND_INLINE_C_AV64_PSEUDO,
+
   // Operand for source modifiers for VOP instructions
   OPERAND_INPUT_MODS,
 
   // Operand for SDWA instructions
   OPERAND_SDWA_VOPC_DST,
 
-  // Operand for AV_MOV_B64_IMM_PSEUDO, which is a pair of 32-bit inline
-  // constants.
-  OPERAND_INLINE_C_AV64_PSEUDO,
-
   OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
   OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
 
@@ -254,7 +254,7 @@ enum OperandType : unsigned {
   OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_FP64,
 
   OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT32,
-  OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_FP64,
+  OPERAND_REG_INLINE_AC_LAST = OPERAND_INLINE_C_AV64_PSEUDO,
 
   OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
   OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -354,10 +354,11 @@ enum : unsigned {
 // Register codes as defined in the TableGen's HWEncoding field.
 namespace HWEncoding {
 enum : unsigned {
-  REG_IDX_MASK = 0xff,
-  IS_VGPR = 1 << 8,
-  IS_AGPR = 1 << 9,
-  IS_HI16 = 1 << 10,
+  REG_IDX_MASK = 0x3ff,
+  LO256_REG_IDX_MASK = 0xff,
+  IS_VGPR = 1 << 10,
+  IS_AGPR = 1 << 11,
+  IS_HI16 = 1 << 12,
 };
 } // namespace HWEncoding
 
@@ -457,6 +458,8 @@ enum Id { // Message ID, width(4) [3:0].
   ID_RTN_GET_TBA_TO_PC = 134,
   ID_RTN_GET_SE_AID_ID = 135,
 
+  ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250
+
   ID_MASK_PreGFX11_ = 0xF,
   ID_MASK_GFX11Plus_ = 0xFF
 };
@@ -572,7 +575,17 @@ enum ModeRegisterMasks : uint32_t {
 
   GPR_IDX_EN_MASK = 1 << 27,
   VSKIP_MASK = 1 << 28,
-  CSP_MASK = 0x7u << 29 // Bits 29..31
+  CSP_MASK = 0x7u << 29, // Bits 29..31
+
+  // GFX1250
+  DST_VGPR_MSB = 1 << 12,
+  SRC0_VGPR_MSB = 1 << 13,
+  SRC1_VGPR_MSB = 1 << 14,
+  SRC2_VGPR_MSB = 1 << 15,
+  VGPR_MSB_MASK = 0xf << 12, // Bits 12..15
+
+  REPLAY_MODE = 1 << 25,
+  FLAT_SCRATCH_IS_NV = 1 << 26,
 };
 
 } // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index dce4e6f99300..6533d4c8eca3 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -627,6 +627,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
   TII = ST.getInstrInfo();
 
+  // Instructions to re-legalize after changing register classes
+  SmallVector<MachineInstr *, 8> Relegalize;
+
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
          ++I) {
@@ -634,6 +637,11 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
       switch (MI.getOpcode()) {
       default:
+        // scale_src has a register class restricted to the low 256 VGPRs;
+        // changing registers to VGPR may not take it into account.
+        if (TII->isWMMA(MI) &&
+            AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
+          Relegalize.push_back(&MI);
         continue;
       case AMDGPU::COPY: {
         const TargetRegisterClass *SrcRC, *DstRC;
@@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   for (auto *MI : PHINodes) {
     processPHINode(*MI);
   }
+  while (!Relegalize.empty())
+    TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);
+
   if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 962c276bc212..5297816ec1f2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
 
 class SIFoldOperandsImpl {
 public:
+  MachineFunction *MF;
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.Def.OpToFold;
+
+  // Verify the register is compatible with the operand.
+  if (const TargetRegisterClass *OpRC =
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+    const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+    const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+    unsigned NewSubReg = New->getSubReg();
+    unsigned OldSubReg = Old.getSubReg();
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (NewSubReg && OldSubReg) {
+      unsigned PreA, PreB;
+      ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+                                                NewSubReg, PreA, PreB);
+    } else if (OldSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+    } else if (NewSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+    }
+
+    if (!ConstrainRC)
+      return false;
+
+    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+                        << TRI->getRegClassName(ConstrainRC) << '\n');
+      return false;
+    }
+  }
+
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1248,6 +1279,7 @@ void SIFoldOperandsImpl::foldOperand(
   if (FoldingImmLike && UseMI->isCopy()) {
     Register DestReg = UseMI->getOperand(0).getReg();
     Register SrcReg = UseMI->getOperand(1).getReg();
+    unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
     assert(SrcReg.isVirtual());
 
     const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
@@ -1259,63 +1291,74 @@ void SIFoldOperandsImpl::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) {
-      std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal();
-      if (UseImmVal && TII->isInlineConstant(
-                           *UseImmVal, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
-        UseMI->getOperand(1).ChangeToImmediate(*UseImmVal);
-        CopiesToReplace.push_back(UseMI);
-        return;
+    // In order to fold immediates into copies, we need to change the copy to a
+    // MOV. Find a compatible mov instruction with the value.
+    for (unsigned MovOp :
+         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
+          AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
+          AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
+      const MCInstrDesc &MovDesc = TII->get(MovOp);
+      assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
+
+      const TargetRegisterClass *MovDstRC =
+          TRI->getRegClass(MovDesc.operands()[0].RegClass);
+
+      // Fold if the MOV's destination register class (MovDstRC) is a
+      // superclass of (or equal to) the COPY's destination register class
+      // (DestRC). If this condition fails, folding would be illegal.
+      if (!DestRC->hasSuperClassEq(MovDstRC))
+        continue;
+
+      const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
+      const TargetRegisterClass *MovSrcRC =
+          TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
+      if (MovSrcRC) {
+        if (UseSubReg)
+          MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
+        if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
+          break;
+
+        // FIXME: This is mutating the instruction only and deferring the actual
+        // fold of the immediate
+      } else {
+        // For the _IMM_PSEUDO cases, there can be value restrictions on the
+        // immediate to verify. Technically we should always verify this, but it
+        // only matters for these concrete cases.
+        // TODO: Handle non-imm case if it's useful.
+        if (!OpToFold.isImm() ||
+            !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
+          break;
       }
-    }
 
-    // Allow immediates COPYd into sgpr_lo16 to be further folded while
-    // still being legal if not further folded
-    if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
-      assert(ST->useRealTrue16Insts());
-      MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
-      DestRC = &AMDGPU::SGPR_32RegClass;
+      MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+      MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+      while (ImpOpI != ImpOpE) {
+        MachineInstr::mop_iterator Tmp = ImpOpI;
+        ImpOpI++;
+        UseMI->removeOperand(UseMI->getOperandNo(Tmp));
+      }
+      UseMI->setDesc(MovDesc);
+
+      if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+        const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+        MachineOperand NewSrcOp(SrcOp);
+        MachineFunction *MF = UseMI->getParent()->getParent();
+        UseMI->removeOperand(1);
+        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+        UseMI->addOperand(NewSrcOp);                          // src0
+        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+        UseOpIdx = SrcIdx;
+        UseOp = &UseMI->getOperand(UseOpIdx);
+      }
+      CopiesToReplace.push_back(UseMI);
+      break;
     }
 
-    // In order to fold immediates into copies, we need to change the
-    // copy to a MOV.
-
-    unsigned MovOp = TII->getMovOpcode(DestRC);
-    if (MovOp == AMDGPU::COPY)
-      return;
-
-    // Fold if the destination register class of the MOV instruction (ResRC)
-    // is a superclass of (or equal to) the destination register class of the
-    // COPY (DestRC). If this condition fails, folding would be illegal.
-    const MCInstrDesc &MovDesc = TII->get(MovOp);
-    assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);
-    const TargetRegisterClass *ResRC =
-        TRI->getRegClass(MovDesc.operands()[0].RegClass);
-    if (!DestRC->hasSuperClassEq(ResRC))
+    // We failed to replace the copy, so give up.
+    if (UseMI->getOpcode() == AMDGPU::COPY)
       return;
 
-    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
-    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
-    while (ImpOpI != ImpOpE) {
-      MachineInstr::mop_iterator Tmp = ImpOpI;
-      ImpOpI++;
-      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
-    }
-    UseMI->setDesc(TII->get(MovOp));
-
-    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
-      const auto &SrcOp = UseMI->getOperand(UseOpIdx);
-      MachineOperand NewSrcOp(SrcOp);
-      MachineFunction *MF = UseMI->getParent()->getParent();
-      UseMI->removeOperand(1);
-      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
-      UseMI->addOperand(NewSrcOp);                          // src0
-      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
-      UseOpIdx = 2;
-      UseOp = &UseMI->getOperand(UseOpIdx);
-    }
-    CopiesToReplace.push_back(UseMI);
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
         UseMI->getOperand(0).getReg().isVirtual() &&
@@ -1430,30 +1473,9 @@ void SIFoldOperandsImpl::foldOperand(
       return;
   }
 
-  if (!FoldingImmLike) {
-    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
-      // Don't fold if OpToFold doesn't hold an aligned register.
-      const TargetRegisterClass *RC =
-          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
-      assert(RC);
-      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
-        unsigned SubReg = OpToFold.getSubReg();
-        if (const TargetRegisterClass *SubRC =
-                TRI->getSubRegisterClass(RC, SubReg))
-          RC = SubRC;
-      }
-
-      if (!RC || !TRI->isProperlyAlignedRC(*RC))
-        return;
-    }
-
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
-    // FIXME: We could try to change the instruction from 64-bit to 32-bit
-    // to enable more folding opportunities.  The shrink operands pass
-    // already does this.
-    return;
-  }
+  // FIXME: We could try to change the instruction from 64-bit to 32-bit
+  // to enable more folding opportunities.  The shrink operands pass
+  // already does this.
 
   tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
 }
@@ -1931,8 +1953,10 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
         // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
         // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
         // later, create a copy here and track if we already have such a copy.
-        if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
-            VGPRUseSubRC) {
+        const TargetRegisterClass *SubRC =
+            TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
+        if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
+          // TODO: Try to reconstrain class
           VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
           BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
           B.addReg(VGPRCopy);
@@ -2748,6 +2772,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+  this->MF = &MF;
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9b348d46fec4..ce25bf499c41 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1132,9 +1132,18 @@ void SIFrameLowering::emitCSRSpillRestores(
     RestoreWWMRegisters(WWMCalleeSavedRegs);
 
     // The original EXEC is the first operand of the return instruction.
-    const MachineInstr &Return = MBB.instr_back();
-    assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
-           "Unexpected return inst");
+    MachineInstr &Return = MBB.instr_back();
+    unsigned Opcode = Return.getOpcode();
+    switch (Opcode) {
+    case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
+      Opcode = AMDGPU::SI_RETURN;
+      break;
+    case AMDGPU::SI_TCRETURN_GFX_WholeWave:
+      Opcode = AMDGPU::SI_TCRETURN_GFX;
+      break;
+    default:
+      llvm_unreachable("Unexpected return inst");
+    }
     Register OrigExec = Return.getOperand(0).getReg();
 
     if (!WWMScratchRegs.empty()) {
@@ -1148,6 +1157,11 @@ void SIFrameLowering::emitCSRSpillRestores(
     // Restore original EXEC.
     unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+
+    // Drop the first operand and update the opcode.
+    Return.removeOperand(0);
+    Return.setDesc(TII->get(Opcode));
+
     return;
   }
 
@@ -1728,7 +1742,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
            "Whole wave functions can use the reg mapped for their i1 argument");
 
     // FIXME: Be more efficient!
-    for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+    unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+    for (MCRegister Reg :
+         AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs))
       if (MF.getRegInfo().isPhysRegModified(Reg)) {
         MFI->reserveWWMRegister(Reg);
         MF.begin()->addLiveIn(Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f..2a977247bc2c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1263,22 +1263,61 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
 static unsigned getIntrMemWidth(unsigned IntrID) {
   switch (IntrID) {
   case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8: // Same width as the global_* b8 forms.
   case Intrinsic::amdgcn_global_store_async_from_lds_b8:
     return 8;
   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: // Same width as the global_* b32 forms.
   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: // NOTE(review): "4B" presumably = 4 bytes (32 bits) — confirm.
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
     return 32;
   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: // Same width as the global_* b64 forms.
   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: // NOTE(review): "8B" presumably = 8 bytes (64 bits) — confirm.
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
     return 64;
   case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: // Same width as the global_* b128 forms.
   case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: // NOTE(review): "16B" presumably = 16 bytes (128 bits) — confirm.
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
     return 128;
   default:
     llvm_unreachable("Unknown width");
   }
 }
 
+static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+                                      TargetLoweringBase::IntrinsicInfo &Info) { // Fills Info.order, Info.flags and Info.ssid from CI.
+  Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2); // Ordering is arg 1 for loads, arg 2 for stores.
+  unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue(); // The operand is expected to be a ConstantInt.
+  switch (AtomicOrderingCABI(Ord)) { // Map the C ABI ordering value onto an AtomicOrdering.
+  case AtomicOrderingCABI::acquire:
+    Info.order = AtomicOrdering::Acquire;
+    break;
+  case AtomicOrderingCABI::release:
+    Info.order = AtomicOrdering::Release;
+    break;
+  case AtomicOrderingCABI::seq_cst:
+    Info.order = AtomicOrdering::SequentiallyConsistent;
+    break;
+  default: // Every value not listed above is treated as monotonic.
+    Info.order = AtomicOrdering::Monotonic;
+    break;
+  }
+
+  Info.flags = // Overwrites (does not OR into) any flags already present in Info.
+      (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
+  Info.flags |= MOCooperative; // Added on top of the load/store flag.
+
+  MDNode *ScopeMD = cast<MDNode>( // Scope is a metadata operand: arg 2 for loads, arg 3 for stores.
+      cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+  StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString(); // First node operand must be an MDString.
+  Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope); // Scope name -> sync scope ID in CI's context.
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -1506,6 +1545,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_load_monitor_b32:
   case Intrinsic::amdgcn_global_load_monitor_b64:
   case Intrinsic::amdgcn_global_load_monitor_b128:
+  case Intrinsic::amdgcn_cluster_load_b32:
+  case Intrinsic::amdgcn_cluster_load_b64:
+  case Intrinsic::amdgcn_cluster_load_b128:
   case Intrinsic::amdgcn_ds_load_tr6_b96:
   case Intrinsic::amdgcn_ds_load_tr4_b64:
   case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1525,6 +1567,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags |= MachineMemOperand::MOLoad;
     return true;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+    Info.ptrVal = CI.getOperand(0);
+    Info.align.reset();
+    getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
+    return true;
+  }
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+    Info.ptrVal = CI.getArgOperand(0);
+    Info.align.reset();
+    getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
+    return true;
+  }
   case Intrinsic::amdgcn_ds_gws_init:
   case Intrinsic::amdgcn_ds_gws_barrier:
   case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1553,7 +1615,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_load_async_to_lds_b8:
   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
-  case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
     Info.ptrVal = CI.getArgOperand(1);
@@ -1636,6 +1702,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   Value *Ptr = nullptr;
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_cond_sub_u32:
+  case Intrinsic::amdgcn_cluster_load_b128:
+  case Intrinsic::amdgcn_cluster_load_b64:
+  case Intrinsic::amdgcn_cluster_load_b32:
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume:
   case Intrinsic::amdgcn_ds_load_tr8_b64:
@@ -1678,6 +1747,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
   case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
+  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
     Ptr = II->getArgOperand(1);
     break;
   default:
@@ -4260,6 +4333,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       break;
     }
 
+    // If the caller is a whole wave function, we need to use a special opcode
+    // so we can patch up EXEC.
+    if (Info->isWholeWaveFunction())
+      OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
+
     return DAG.getNode(OPC, DL, MVT::Other, Ops);
   }
 
@@ -5192,7 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
-static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                      MachineBasicBlock *BB) {
+  // Replaces the 64-bit scalar add/sub pseudo MI in BB with real instructions.
+  // GFX12 (hasScalarAddSub64): s_add_u64/s_sub_u64; older targets: 32-bit ops.
+  MachineFunction *MF = BB->getParent();
+  const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0); // Operand 0 is the result, 1 and 2 are the sources.
+  MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Src1 = MI.getOperand(2);
+  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); // Any other opcode is handled as a subtract.
+  if (ST.hasScalarAddSub64()) {
+    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    // clang-format off
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    // clang-format on
+  } else {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); // Low 32 bits of the result.
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); // High 32 bits of the result.
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( // sub0/sub1 halves of each source follow.
+        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; // With-carry / with-borrow opcode for the high half.
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); // Low halves are emitted first.
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); // NOTE(review): presumably consumes the carry/borrow of the low op — confirm.
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) // Recombine both halves into Dest.
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+  }
+  MI.eraseFromParent(); // The pseudo is replaced on both paths; MI is invalid after this call.
+  return BB; // Always the block that was passed in.
+}
+
+static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
     return std::numeric_limits<uint32_t>::max();
@@ -5210,10 +5339,42 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::S_AND_B32:
     return std::numeric_limits<uint32_t>::max();
   default:
-    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+    llvm_unreachable(
+        "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
   }
 }
 
+static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { // Identity value used to seed the accumulator for a 64-bit reduction.
+  switch (Opc) {
+  case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+    return std::numeric_limits<uint64_t>::max();
+  case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+    return std::numeric_limits<int64_t>::max(); // Largest signed value, converted to uint64_t on return.
+  case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+    return std::numeric_limits<uint64_t>::min(); // i.e. 0
+  case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+    return std::numeric_limits<int64_t>::min(); // Smallest signed value, converted to uint64_t on return.
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
+    return std::numeric_limits<uint64_t>::min(); // i.e. 0, the identity for add/sub/or/xor.
+  case AMDGPU::S_AND_B64:
+    return std::numeric_limits<uint64_t>::max(); // All bits set, the identity for and.
+  default: // Only the opcodes listed above are supported.
+    llvm_unreachable(
+        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
+  }
+}
+
+static bool is32bitWaveReduceOperation(unsigned Opc) { // True iff Opc is one of the nine 32-bit scalar opcodes below.
+  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
+         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
+         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
+         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
+         Opc == AMDGPU::S_XOR_B32; // lowerWaveReduce takes its 64-bit path for every other opcode.
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5241,53 +5402,99 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       RetBB = &BB;
       break;
     }
+    case AMDGPU::V_CMP_LT_U64_e64: // umin
+    case AMDGPU::V_CMP_LT_I64_e64: // min
+    case AMDGPU::V_CMP_GT_U64_e64: // umax
+    case AMDGPU::V_CMP_GT_I64_e64: // max
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64: {
+      // Idempotent operations.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
+    case AMDGPU::S_ADD_U64_PSEUDO:
+    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_U64_PSEUDO: {
       const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
       const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
       Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+      Register NumActiveLanes =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 
       bool IsWave32 = ST.isWave32();
       unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-      unsigned CountReg =
+      unsigned BitCountOpc =
           IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
 
-      auto Exec =
-          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+      BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
 
-      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                                .addReg(Exec->getOperand(0).getReg());
+      auto NewAccumulator =
+          BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
+              .addReg(ExecMask);
 
       switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
         // Performing an XOR operation on a uniform value
         // depends on the parity of the number of active lanes.
         // For even parity, the result will be 0, for odd
         // parity the result will be the same as the input value.
-        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
-        auto ParityReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
-                .addReg(NewAccumulator->getOperand(0).getReg())
-                .addImm(1);
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityReg->getOperand(0).getReg());
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+            .addReg(NewAccumulator->getOperand(0).getReg())
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        if (Opc == AMDGPU::S_XOR_B32) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+        }
         break;
       }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
         // Take the negation of the source operand.
-        auto InvertedValReg =
-            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-                .addImm(-1)
-                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+            .addImm(0)
+            .addReg(SrcReg);
         BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NegatedVal)
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
@@ -5297,6 +5504,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
             .addReg(NewAccumulator->getOperand(0).getReg());
         break;
       }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1H_Op0L_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register Op1L_Op0H_Reg =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValLo =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        Register NegatedValHi =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+        const TargetRegisterClass *Src1SubRC =
+            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+        MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+        MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+              .addImm(0)
+              .addReg(NewAccumulator->getOperand(0).getReg())
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+              .addReg(NegatedValLo)
+              .addImm(31)
+              .setOperandDead(3); // Dead scc
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+              .add(Op1L)
+              .addReg(NegatedValHi);
+        }
+        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                                 ? NegatedValLo
+                                 : NewAccumulator->getOperand(0).getReg();
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+            .add(Op1L)
+            .addReg(LowOpcode);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+            .add(Op1H)
+            .addReg(LowOpcode);
+
+        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+            .addReg(CarryReg)
+            .addReg(Op1H_Op0L_Reg)
+            .setOperandDead(3); // Dead scc
+
+        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(HiVal)
+              .addReg(Op1L_Op0H_Reg)
+              .setOperandDead(3); // Dead scc
+        }
+        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+            .addReg(DestSub0)
+            .addImm(AMDGPU::sub0)
+            .addReg(DestSub1)
+            .addImm(AMDGPU::sub1);
+        break;
+      }
       }
       RetBB = &BB;
     }
@@ -5313,6 +5589,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // so that we will get the next active lane for next iteration.
     MachineBasicBlock::iterator I = BB.end();
     Register SrcReg = MI.getOperand(1).getReg();
+    bool is32BitOpc = is32bitWaveReduceOperation(Opc);
 
     // Create Control flow for loop
     // Split MI's Machine Basic block into For loop
@@ -5322,73 +5599,160 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+    Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
     Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
-    Register LaneValueReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
 
     bool IsWave32 = ST.isWave32();
-    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
     // Create initial values of induction variable from Exec, Accumulator and
     // insert branch instr to newly created ComputeBlock
-    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
-    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);
+    BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+    if (is32BitOpc) {
+      uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+          .addImm(IdentityValue);
+    } else {
+      uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
+      BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
+          .addImm(IdentityValue);
+    }
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
     // clang-format on
 
     // Start constructing ComputeLoop
-    I = ComputeLoop->end();
+    I = ComputeLoop->begin();
     auto Accumulator =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
-            .addReg(InitalValReg)
+            .addReg(IdentityValReg)
             .addMBB(&BB);
     auto ActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
-            .addReg(TmpSReg->getOperand(0).getReg())
+            .addReg(LoopIterator)
             .addMBB(&BB);
 
+    I = ComputeLoop->end();
+    MachineInstr *NewAccumulator;
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());
-
+    BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+        .addReg(ActiveBitsReg);
+    if (is32BitOpc) {
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueReg)
+          .addReg(SrcReg)
+          .addReg(FF1Reg);
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValueReg);
+    } else {
+      Register LaneValueLoReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValueHiReg =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // lane value input should be in an sgpr
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueLoReg)
+          .add(Op1L)
+          .addReg(FF1Reg);
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+              LaneValueHiReg)
+          .add(Op1H)
+          .addReg(FF1Reg);
+      auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                               TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
+                           .addReg(LaneValueLoReg)
+                           .addImm(AMDGPU::sub0)
+                           .addReg(LaneValueHiReg)
+                           .addImm(AMDGPU::sub1);
+      switch (Opc) {
+      case AMDGPU::S_OR_B64:
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_XOR_B64: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
+      case AMDGPU::V_CMP_GT_I64_e64:
+      case AMDGPU::V_CMP_GT_U64_e64:
+      case AMDGPU::V_CMP_LT_I64_e64:
+      case AMDGPU::V_CMP_LT_U64_e64: {
+        Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+        Register ComparisonResultReg =
+            MRI.createVirtualRegister(WaveMaskRegClass);
+        const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
+        const TargetRegisterClass *VSubRegClass =
+            TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+        Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+        MachineOperand SrcReg0Sub0 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub0, VSubRegClass);
+        MachineOperand SrcReg0Sub1 =
+            TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
+                                         VregClass, AMDGPU::sub1, VSubRegClass);
+        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+                AccumulatorVReg)
+            .add(SrcReg0Sub0)
+            .addImm(AMDGPU::sub0)
+            .add(SrcReg0Sub1)
+            .addImm(AMDGPU::sub1);
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+            .addReg(LaneValue->getOperand(0).getReg())
+            .addReg(AccumulatorVReg);
+
+        unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+            .addReg(LaneMaskReg)
+            .addReg(ActiveBitsReg);
+
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                                 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .addReg(Accumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg());
+        ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+        break;
+      }
+      }
+    }
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
-    auto NewActiveBits =
-        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
-            .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+        .addReg(FF1Reg)
+        .addReg(ActiveBitsReg);
 
     // Add phi nodes
-    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
-    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
-        .addMBB(ComputeLoop);
+    Accumulator.addReg(DstReg).addMBB(ComputeLoop);
+    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
     // Creating branching
     unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
     BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
-        .addReg(NewActiveBits->getOperand(0).getReg())
+        .addReg(NewActiveBitsReg)
         .addImm(0);
     BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
         .addMBB(ComputeLoop);
@@ -5410,22 +5774,40 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
   case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
@@ -5452,55 +5834,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    // For targets older than GFX12, we emit a sequence of 32-bit operations.
-    // For GFX12, we emit s_add_u64 and s_sub_u64.
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-    if (Subtarget->hasScalarAddSub64()) {
-      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
-      // clang-format off
-      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
-          .add(Src0)
-          .add(Src1);
-      // clang-format on
-    } else {
-      const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
-      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
-      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
-          .add(Src0Sub0)
-          .add(Src1Sub0);
-      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
-          .add(Src0Sub1)
-          .add(Src1Sub1);
-      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    MI.eraseFromParent();
-    return BB;
+    return Expand64BitScalarArithmetic(MI, BB);
   }
   case AMDGPU::V_ADD_U64_PSEUDO:
   case AMDGPU::V_SUB_U64_PSEUDO: {
@@ -6023,14 +6357,15 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent();
     return SplitBB;
   }
+  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
   case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
     assert(MFI->isWholeWaveFunction());
 
     // During ISel, it's difficult to propagate the original EXEC mask to use as
     // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
     MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
-    Register OriginalExec = Setup->getOperand(0).getReg();
     assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
+    Register OriginalExec = Setup->getOperand(0).getReg();
     MF->getRegInfo().clearKillFlags(OriginalExec);
     MI.getOperand(0).setReg(OriginalExec);
     return BB;
@@ -10246,6 +10581,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
     return SDValue(NewMI, 0);
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+    SDValue Chain = Op->getOperand(0);
+    SDValue Ptr = Op->getOperand(2);
+    EVT VT = Op->getValueType(0);
+    return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
+                             Chain, Ptr, MII->getMemOperand());
+  }
   default:
 
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10421,41 +10766,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
   }
-  case Intrinsic::amdgcn_s_barrier:
-  case Intrinsic::amdgcn_s_barrier_signal:
-  case Intrinsic::amdgcn_s_barrier_wait: {
-    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-    if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
-      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
-      if (WGSize <= ST.getWavefrontSize()) {
-        // If the workgroup fits in a wave, remove s_barrier_signal and lower
-        // s_barrier/s_barrier_wait to wave_barrier.
-        if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
-          return Op.getOperand(0);
-        else
-          return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
-                                            MVT::Other, Op.getOperand(0)),
-                         0);
-      }
-    }
-
-    if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-      // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-      SDValue K =
-          DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
-      SDValue BarSignal =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
-                                     MVT::Other, K, Op.getOperand(0)),
-                  0);
-      SDValue BarWait =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
-                                     BarSignal.getValue(0)),
-                  0);
-      return BarWait;
-    }
-
-    return SDValue();
-  };
 
   case Intrinsic::amdgcn_struct_tbuffer_store:
   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -10913,6 +11223,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                    Op->getVTList(), Ops, M->getMemoryVT(),
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+    SDValue Chain = Op->getOperand(0);
+    SDValue Ptr = Op->getOperand(2);
+    SDValue Val = Op->getOperand(3);
+    return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
+                         Ptr, MII->getMemOperand());
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -16933,10 +17253,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
       switch (BitWidth) {
       case 16:
         RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
-                                             : &AMDGPU::VGPR_32RegClass;
+                                             : &AMDGPU::VGPR_32_Lo256RegClass;
         break;
       default:
-        RC = TRI->getVGPRClassForBitWidth(BitWidth);
+        RC = Subtarget->has1024AddressableVGPRs()
+                 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
+                 : TRI->getVGPRClassForBitWidth(BitWidth);
         if (!RC)
           return std::pair(0U, nullptr);
         break;
@@ -16980,7 +17302,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
   auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
   if (Kind != '\0') {
     if (Kind == 'v') {
-      RC = &AMDGPU::VGPR_32RegClass;
+      RC = &AMDGPU::VGPR_32_Lo256RegClass;
     } else if (Kind == 's') {
       RC = &AMDGPU::SGPR_32RegClass;
     } else if (Kind == 'a') {
@@ -17022,6 +17344,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
         return std::pair(0U, nullptr);
       if (Idx < RC->getNumRegs())
         return std::pair(RC->getRegister(Idx), RC);
+      return std::pair(0U, nullptr);
     }
   }
 
@@ -17808,11 +18131,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
          !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
 }
 
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+  // With globally addressable scratch (GAS), lower to a flat atomic.
+  if (STI.hasGloballyAddressableScratch())
+    return TargetLowering::AtomicExpansionKind::CustomExpand;
+  return TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   // 64-bit flat atomics that dynamically reside in private memory will silently
   // be dropped.
@@ -17823,7 +18154,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   if (AS == AMDGPUAS::FLAT_ADDRESS &&
       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
       flatInstrMayAccessPrivate(RMW))
-    return AtomicExpansionKind::Expand;
+    return AtomicExpansionKind::CustomExpand;
 
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
@@ -17898,7 +18229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
         // does. InstCombine transforms these with 0 to or, so undo that.
         if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
             ConstVal && ConstVal->isNullValue())
-          return AtomicExpansionKind::Expand;
+          return AtomicExpansionKind::CustomExpand;
       }
 
       // If the allocation could be in remote, fine-grained memory, the rmw
@@ -18027,9 +18358,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
         // fadd.
         if (Subtarget->hasLDSFPAtomicAddF32()) {
           if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
-            return AtomicExpansionKind::Expand;
+            return AtomicExpansionKind::CustomExpand;
           if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
-            return AtomicExpansionKind::Expand;
+            return AtomicExpansionKind::CustomExpand;
         }
       }
     }
@@ -18083,14 +18414,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
@@ -18098,7 +18429,7 @@ TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
   unsigned AddrSpace = CmpX->getPointerAddressSpace();
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
     return AtomicExpansionKind::None;
@@ -18109,7 +18440,7 @@ SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
 
   // If a 64-bit flat atomic may alias private, we need to avoid using the
   // atomic in the private case.
-  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
                                            : AtomicExpansionKind::None;
 }
 
@@ -18468,9 +18799,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   Builder.CreateBr(ExitBB);
 }
 
+// Replace the private pointer operand PtrOpIdx of I with a cast to flat.
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+                                             unsigned PtrOpIdx) {
+  Value *ScratchPtr = I->getOperand(PtrOpIdx);
+  assert(ScratchPtr->getType()->getPointerAddressSpace() ==
+         AMDGPUAS::PRIVATE_ADDRESS);
+  Type *FlatPtrTy = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+  I->setOperand(PtrOpIdx, CastInst::CreatePointerCast(
+                              ScratchPtr, FlatPtrTy, "scratch.ascast",
+                              I->getIterator()));
+}
+
 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
+  if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18493,9 +18839,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 }
 
 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
   emitExpandAtomicAddrSpacePredicate(CI);
 }
 
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+  // Any address space other than private is not expected to reach this hook.
+  if (LI->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    llvm_unreachable(
+        "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+  convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+  // Any address space other than private is not expected to reach this hook.
+  if (SI->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    llvm_unreachable(
+        "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+  convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae17077..728c6490bdfd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
 
-#include "AMDGPUISelLowering.h"
 #include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUISelLowering.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -562,6 +562,8 @@ public:
   void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
   void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+  void emitExpandAtomicLoad(LoadInst *LI) const override;
+  void emitExpandAtomicStore(StoreInst *SI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e3a2efdd3856..b163a274396f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -152,7 +152,7 @@ static constexpr StringLiteral WaitEventTypeName[] = {
 // We reserve a fixed number of VGPR slots in the scoring tables for
 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
 enum RegisterMapping {
-  SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets.
+  SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 512,       // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 128,  // Maximum programmable SGPRs across all targets.
   // Artificial register slots to track LDS writes into specific LDS locations
@@ -831,7 +831,6 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
 
   MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
   unsigned RegIdx = TRI->getHWRegIndex(MCReg);
-  assert(isUInt<8>(RegIdx));
 
   const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
   unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -839,7 +838,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
   // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
   if (TRI->isVectorRegister(*MRI, Op.getReg())) {
     unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
-    assert(Reg < AGPR_OFFSET);
+    assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
     Result.first = Reg;
     if (TRI->isAGPR(*MRI, Op.getReg()))
       Result.first += AGPR_OFFSET;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 69708c47f6c9..398c99b3bd12 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -62,8 +62,8 @@ static cl::opt<bool> Fix16BitCopies(
   cl::ReallyHidden);
 
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
-  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
-    RI(ST), ST(ST) {
+    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+      RI(ST), ST(ST) {
   SchedModel.init(&ST);
 }
 
@@ -2493,7 +2493,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
   case AMDGPU::SI_RETURN: {
     const MachineFunction *MF = MBB.getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -3444,12 +3443,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
   case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
-    return true;
   case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
-    // TODO: We could fold this, but it's a strange case. The immediate value
-    // can't be directly folded into any real use. We would have to spread new
-    // immediate legality checks around and only accept subregister extracts for
-    // profitability.
+    return true;
   default:
     return false;
   }
@@ -3559,13 +3554,12 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
 
 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                 Register Reg, MachineRegisterInfo *MRI) const {
-  if (!MRI->hasOneNonDBGUse(Reg))
-    return false;
-
   int64_t Imm;
   if (!getConstValDefinedInReg(DefMI, Reg, Imm))
     return false;
 
+  const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
+
   assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
 
   unsigned Opc = UseMI.getOpcode();
@@ -3577,6 +3571,25 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
     const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
 
+    if (HasMultipleUses) {
+      // TODO: This should fold in more cases with multiple uses, but we need
+      // to more carefully consider what those uses are.
+      unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
+
+      // Avoid breaking up a 64-bit inline immediate into a subregister extract.
+      if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
+        return false;
+
+      // Most of the time folding a 32-bit inline constant is free (though this
+      // might not be true if we can't later fold it into a real user).
+      //
+      // FIXME: This isInlineConstant check is imprecise if
+      // getConstValDefinedInReg handled the tricky non-mov cases.
+      if (ImmDefSize == 32 &&
+          !isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
+        return false;
+    }
+
     bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
                    RI.getSubRegIdxSize(UseSubReg) == 16;
 
@@ -3664,6 +3677,9 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     return true;
   }
 
+  if (HasMultipleUses)
+    return false;
+
   if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
@@ -4572,34 +4588,43 @@ static bool compareMachineOp(const MachineOperand &Op0,
   }
 }
 
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
-                                    const MachineOperand &MO) const {
-  const MCInstrDesc &InstDesc = MI.getDesc();
-  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
-
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
-
+bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
+                                        const MCOperandInfo &OpInfo) const {
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
 
-  if (OpInfo.RegClass < 0)
+  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
     return false;
 
-  if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
-    if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
-        OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                    AMDGPU::OpName::src2))
+  if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
+    return true;
+
+  return ST.hasVOP3Literal();
+}
+
+bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+                                    int64_t ImmVal) const {
+  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
+  if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
+    if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
+        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
+                                                     AMDGPU::OpName::src2))
       return false;
     return RI.opCanUseInlineConstant(OpInfo.OperandType);
   }
 
-  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
-    return false;
+  return isLiteralOperandLegal(InstDesc, OpInfo);
+}
 
-  if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
-    return true;
+bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+                                    const MachineOperand &MO) const {
+  if (MO.isImm())
+    return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
 
-  return ST.hasVOP3Literal();
+  assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
+         "unexpected imm-like operand kind");
+  const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
+  return isLiteralOperandLegal(InstDesc, OpInfo);
 }
 
 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
@@ -4759,6 +4784,31 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
   return Inst32;
 }
 
+bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
+  const Register PhysReg = RegOp.getReg();
+  // The null registers are free: they never count against the constant bus.
+  if (PhysReg == AMDGPU::SGPR_NULL || PhysReg == AMDGPU::SGPR_NULL64)
+    return false;
+
+  // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
+  // physical register operands should also count, except for exec.
+  if (RegOp.isImplicit()) {
+    return PhysReg == AMDGPU::VCC || PhysReg == AMDGPU::VCC_LO ||
+           PhysReg == AMDGPU::M0;
+  }
+
+  // Explicit operands in SReg_32 or SReg_64 use the constant bus.
+  return AMDGPU::SReg_32RegClass.contains(PhysReg) ||
+         AMDGPU::SReg_64RegClass.contains(PhysReg);
+}
+
+bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
+                                     const MachineRegisterInfo &MRI) const {
+  if (Register Reg = RegOp.getReg(); Reg.isVirtual())
+    return RI.isSGPRClass(MRI.getRegClass(Reg));
+  return physRegUsesConstantBus(RegOp);
+}
+
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                   const MachineOperand &MO,
                                   const MCOperandInfo &OpInfo) const {
@@ -4766,23 +4816,9 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
   if (!MO.isReg())
     return !isInlineConstant(MO, OpInfo);
 
-  if (!MO.isUse())
-    return false;
-
-  if (MO.getReg().isVirtual())
-    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
-
-  // Null is free
-  if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
-    return false;
-
-  // SGPRs use the constant bus
-  if (MO.isImplicit()) {
-    return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
-           MO.getReg() == AMDGPU::VCC_LO;
-  }
-  return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
-         AMDGPU::SReg_64RegClass.contains(MO.getReg());
+  Register Reg = MO.getReg();
+  return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
+                         : physRegUsesConstantBus(MO);
 }
 
 static Register findImplicitSGPRRead(const MachineInstr &MI) {
@@ -4933,7 +4969,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
 
     int RegClass = Desc.operands()[i].RegClass;
 
-    switch (Desc.operands()[i].OperandType) {
+    const MCOperandInfo &OpInfo = Desc.operands()[i];
+    switch (OpInfo.OperandType) {
     case MCOI::OPERAND_REGISTER:
       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
         ErrInfo = "Illegal immediate value for operand.";
@@ -4941,15 +4978,31 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       }
       break;
     case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
     case AMDGPU::OPERAND_REG_IMM_FP32:
     case AMDGPU::OPERAND_REG_IMM_V2FP32:
+    case AMDGPU::OPERAND_REG_IMM_BF16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_IMM_V2FP16:
+    case AMDGPU::OPERAND_REG_IMM_V2INT16:
+    case AMDGPU::OPERAND_REG_IMM_V2INT32:
+    case AMDGPU::OPERAND_REG_IMM_V2BF16:
+    case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+      // OPERAND_REG_IMM_* operand types get no type-specific check in this
+      // switch.
       break;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_BF16:
     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
     case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
@@ -4965,6 +5018,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
         return false;
       }
       break;
+    case AMDGPU::OPERAND_INPUT_MODS:
+    case AMDGPU::OPERAND_SDWA_VOPC_DST:
+    case AMDGPU::OPERAND_KIMM16:
+      break;
     case MCOI::OPERAND_IMMEDIATE:
     case AMDGPU::OPERAND_KIMM32:
     case AMDGPU::OPERAND_KIMM64:
@@ -4976,9 +5033,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
         ErrInfo = "Expected immediate, but got non-immediate";
         return false;
       }
-      [[fallthrough]];
+      break;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_MEMORY:
+    case MCOI::OPERAND_PCREL:
+      break;
     default:
-      continue;
+      if (OpInfo.isGenericType())
+        continue;
+      break;
     }
 
     if (!MO.isReg())
@@ -4991,7 +5054,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     // aligned register constraint.
     // FIXME: We do not verify inline asm operands, but custom inline asm
     // verification is broken anyway
-    if (ST.needsAlignedVGPRs()) {
+    if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
       if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
         if (const TargetRegisterClass *SubRC =
@@ -5912,13 +5975,12 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
 
 static const TargetRegisterClass *
 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
-                          const MachineRegisterInfo &MRI,
                           const MCInstrDesc &TID, unsigned RCID,
                           bool IsAllocatable) {
-  if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+  if ((IsAllocatable || !ST.hasGFX90AInsts()) &&
       (((TID.mayLoad() || TID.mayStore()) &&
         !(TID.TSFlags & SIInstrFlags::Spill)) ||
-       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+       (TID.TSFlags & SIInstrFlags::MIMG))) {
     switch (RCID) {
     case AMDGPU::AV_32RegClassID:
       RCID = AMDGPU::VGPR_32RegClassID;
@@ -5953,44 +6015,31 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
   if (OpNum >= TID.getNumOperands())
     return nullptr;
   auto RegClass = TID.operands()[OpNum].RegClass;
-  bool IsAllocatable = false;
-  if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
-    // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
-    // with two data operands. Request register class constrained to VGPR only
-    // of both operands present as Machine Copy Propagation can not check this
-    // constraint and possibly other passes too.
-    //
-    // The check is limited to FLAT and DS because atomics in non-flat encoding
-    // have their vdst and vdata tied to be the same register.
-    const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
-                                                   AMDGPU::OpName::vdst);
-    const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
-        (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
-                                         : AMDGPU::OpName::vdata);
-    if (DataIdx != -1) {
-      IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
-                                           TID.Opcode, AMDGPU::OpName::data1);
-    }
+  if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+    // Special pseudos have no alignment requirement
+    return RI.getRegClass(RegClass);
   }
-  return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
-                                   IsAllocatable);
+
+  return adjustAllocatableRegClass(ST, RI, TID, RegClass, false);
 }
 
 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
                                                       unsigned OpNo) const {
-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   const MCInstrDesc &Desc = get(MI.getOpcode());
   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
       Desc.operands()[OpNo].RegClass == -1) {
     Register Reg = MI.getOperand(OpNo).getReg();
 
-    if (Reg.isVirtual())
+    if (Reg.isVirtual()) {
+      const MachineRegisterInfo &MRI =
+          MI.getParent()->getParent()->getRegInfo();
       return MRI.getRegClass(Reg);
+    }
     return RI.getPhysRegBaseClass(Reg);
   }
 
   unsigned RCID = Desc.operands()[OpNo].RegClass;
-  return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
+  return adjustAllocatableRegClass(ST, RI, Desc, RCID, true);
 }
 
 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
@@ -6224,15 +6273,14 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
         continue;
       const MachineOperand &Op = MI.getOperand(i);
       if (Op.isReg()) {
-        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
-        if (!SGPRsUsed.count(SGPR) &&
-            // FIXME: This can access off the end of the operands() array.
-            usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
-          if (--ConstantBusLimit <= 0)
-            return false;
-          SGPRsUsed.insert(SGPR);
+        if (Op.isUse()) {
+          RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
+          if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
+            if (--ConstantBusLimit <= 0)
+              return false;
+          }
         }
-      } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
+      } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
                  !isInlineConstant(Op, InstDesc.operands()[i])) {
         // The same literal may be used multiple times.
         if (!UsedLiteral)
@@ -6526,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
       !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
     legalizeOpWithMove(MI, VOP3Idx[2]);
 
+  if (isWMMA(MI)) {
+    // scale_src has a register class restricted to the low 256 VGPRs, so we may
+    // need to insert a copy to the restricted VGPR class.
+    int ScaleSrc0Idx =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
+    if (ScaleSrc0Idx != -1) {
+      int ScaleSrc1Idx =
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
+      if (!isOperandLegal(MI, ScaleSrc0Idx))
+        legalizeOpWithMove(MI, ScaleSrc0Idx);
+      if (!isOperandLegal(MI, ScaleSrc1Idx))
+        legalizeOpWithMove(MI, ScaleSrc1Idx);
+    }
+  }
+
   // Fix the register class of packed FP32 instructions on gfx12+. See
   // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
   if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
@@ -8036,12 +8099,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       MRI.replaceRegWith(DstReg, NewDstReg);
       MRI.clearKillFlags(NewDstReg);
       Inst.getOperand(0).setReg(DstReg);
-      // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
-      // these are deleted later, but at -O0 it would leave a suspicious
-      // looking illegal copy of an undef register.
-      for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
-        Inst.removeOperand(I);
-      Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+      Inst.eraseFromParent();
       // Legalize t16 operand since replaceReg is called after addUsersToVALU
       for (MachineOperand &MO :
            make_early_inc_range(MRI.use_operands(NewDstReg))) {
@@ -9235,6 +9293,9 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
 
 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                              AMDGPU::OpName OperandName) const {
+  if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
+    return nullptr;
+
   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
   if (Idx == -1)
     return nullptr;
@@ -9532,6 +9593,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
       {
           {MONoClobber, "amdgpu-noclobber"},
           {MOLastUse, "amdgpu-last-use"},
+          {MOCooperative, "amdgpu-cooperative"},
       };
 
   return ArrayRef(TargetFlags);
@@ -10219,7 +10281,7 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
   const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
-  unsigned opcode = MI.getOpcode();
+  unsigned Opcode = MI.getOpcode();
 
   auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
     Register Dst = MI.getOperand(0).getReg();
@@ -10239,7 +10301,7 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
   // If the target supports globally addressable scratch, the mapping from
   // scratch memory to the flat aperture changes therefore an address space cast
   // is no longer uniform.
-  if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+  if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
     return HandleAddrSpaceCast(MI);
 
   if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
@@ -10267,7 +10329,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
   //
   // All other loads are not divergent, because if threads issue loads with the
   // same arguments, they will always get the same result.
-  if (opcode == AMDGPU::G_LOAD) {
+  if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
+      Opcode == AMDGPU::G_SEXTLOAD) {
     if (MI.memoperands_empty())
       return InstructionUniformity::NeverUniform; // conservative assumption
 
@@ -10281,10 +10344,10 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
     return InstructionUniformity::Default;
   }
 
-  if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
-      opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
-      opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
-      AMDGPU::isGenericAtomic(opcode)) {
+  if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
+      Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
+      Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
+      AMDGPU::isGenericAtomic(Opcode)) {
     return InstructionUniformity::NeverUniform;
   }
   return InstructionUniformity::Default;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66b..f7dde2b90b68 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -48,6 +48,10 @@ static const MachineMemOperand::Flags MONoClobber =
 static const MachineMemOperand::Flags MOLastUse =
     MachineMemOperand::MOTargetFlag2;
 
+/// Mark the MMO of cooperative load/store atomics.
+static const MachineMemOperand::Flags MOCooperative =
+    MachineMemOperand::MOTargetFlag3;
+
 /// Utility to store machine instructions worklist.
 struct SIInstrWorklist {
   SIInstrWorklist() = default;
@@ -533,13 +537,13 @@ public:
     return get(Opcode).TSFlags & SIInstrFlags::VOP2;
   }
 
-  static bool isVOP3(const MachineInstr &MI) {
-    return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+  static bool isVOP3(const MCInstrDesc &Desc) {
+    return Desc.TSFlags & SIInstrFlags::VOP3;
   }
 
-  bool isVOP3(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-  }
+  static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
+
+  bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
 
   static bool isSDWA(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
@@ -841,13 +845,13 @@ public:
     return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
   }
 
-  static bool isMAI(const MachineInstr &MI) {
-    return MI.getDesc().TSFlags & SIInstrFlags::IsMAI;
+  static bool isMAI(const MCInstrDesc &Desc) {
+    return Desc.TSFlags & SIInstrFlags::IsMAI;
   }
 
-  bool isMAI(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
-  }
+  static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); }
+
+  bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); }
 
   static bool isMFMA(const MachineInstr &MI) {
     return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
@@ -983,13 +987,19 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
   }
 
-  bool isBarrier(unsigned Opcode) const {
+  // Check to see if the opcode is for a barrier start. Pre-gfx12 this is just
+  // S_BARRIER, but with support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
+  // to check for the barrier start (S_BARRIER_SIGNAL*).
+  bool isBarrierStart(unsigned Opcode) const {
     return Opcode == AMDGPU::S_BARRIER ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
            Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
-           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
-           Opcode == AMDGPU::S_BARRIER_WAIT ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+  }
+
+  bool isBarrier(unsigned Opcode) const {
+    return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
            Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
            Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
            Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
@@ -1045,6 +1055,8 @@ public:
       return AMDGPU::S_WAIT_DSCNT;
     case AMDGPU::S_WAIT_KMCNT_soft:
       return AMDGPU::S_WAIT_KMCNT;
+    case AMDGPU::S_WAIT_XCNT_soft:
+      return AMDGPU::S_WAIT_XCNT;
     default:
       return Opcode;
     }
@@ -1174,9 +1186,20 @@ public:
     return isInlineConstant(*MO.getParent(), MO.getOperandNo());
   }
 
-  bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+  bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
                          const MachineOperand &MO) const;
 
+  bool isLiteralOperandLegal(const MCInstrDesc &InstDesc,
+                             const MCOperandInfo &OpInfo) const;
+
+  bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
+                         int64_t ImmVal) const;
+
+  bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                         const MachineOperand &MO) const {
+    return isImmOperandLegal(MI.getDesc(), OpNo, MO);
+  }
+
   /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
   bool isLegalAV64PseudoImm(uint64_t Imm) const;
 
@@ -1184,6 +1207,10 @@ public:
   /// This function will return false if you pass it a 32-bit instruction.
   bool hasVALU32BitEncoding(unsigned Opcode) const;
 
+  bool physRegUsesConstantBus(const MachineOperand &Reg) const;
+  bool regUsesConstantBus(const MachineOperand &Reg,
+                          const MachineRegisterInfo &MRI) const;
+
   /// Returns true if this operand uses the constant bus.
   bool usesConstantBus(const MachineRegisterInfo &MRI,
                        const MachineOperand &MO,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0374526e35c4..aa5dae09ca18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1806,15 +1806,15 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
                                    VOPDstOperand_t16Lo128),
                     VOPDstOperand<VGPR_32>);
   RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>,
-                              !eq(VT.Size, 512) : VOPDstOperand<VReg_512>,
-                              !eq(VT.Size, 256) : VOPDstOperand<VReg_256>,
-                              !eq(VT.Size, 192) : VOPDstOperand<VReg_192>,
-                              !eq(VT.Size, 128) : VOPDstOperand<VReg_128>,
+                              !eq(VT.Size, 512)  : VOPDstOperand<VReg_512>,
+                              !eq(VT.Size, 256)  : VOPDstOperand<VReg_256>,
+                              !eq(VT.Size, 192)  : VOPDstOperand<VReg_192>,
+                              !eq(VT.Size, 128)  : VOPDstOperand<VReg_128>,
                               !eq(VT.Size, 96)   : VOPDstOperand<VReg_96>,
-                              !eq(VT.Size, 64)  : VOPDstOperand<VReg_64>,
-                              !eq(VT.Size, 32)  : VOPDstOperand<VGPR_32>,
-                              !eq(VT.Size, 16)  : op16,
-                              1                 : VOPDstS64orS32); // else VT == i1
+                              !eq(VT.Size, 64)   : VOPDstOperand<VReg_64>,
+                              !eq(VT.Size, 32)   : VOPDstOperand<VGPR_32>,
+                              !eq(VT.Size, 16)   : op16,
+                              1                  : VOPDstS64orS32); // else VT == i1
 }
 
 class getVALUDstForVT_fake16<ValueType VT> {
@@ -1898,7 +1898,7 @@ class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> {
         !eq(VT.Size, 64)  : RegisterOperand<VReg_64>,
         !eq(VT.Size, 48)  : RegisterOperand<VReg_64>,
         !eq(VT.Size, 16)  : !if(IsTrue16,
-                                !if(IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128),
+                                !if(IsFake16, VGPROp_32_Lo128, VGPROp_16_Lo128),
                                 RegisterOperand<VGPR_32>),
         1                 : RegisterOperand<VGPR_32>);
 }
@@ -1950,6 +1950,20 @@ class getVOP3VRegSrcForVT<ValueType VT> {
                               1 : VRegSrc_32);
 }
 
+// VGPR-only VOP3 src with 8-bit encoding, e.g. VOP3DPP src0.
+class getVGPRSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+  RegisterOperand ret =
+   !cond(!eq(VT.Size, 128) : VGPROp_128,
+         !eq(VT.Size, 96)  : VGPROp_96,
+         !eq(VT.Size, 64)  : VGPROp_64,
+         !eq(VT.Size, 48)  : VGPROp_64,
+         !eq(VT.Size, 16)  : !if(IsTrue16,
+                                 !if(IsFake16, VGPROp_32,
+                                               VGPROp_16),
+                                 VGPROp_32),
+         1                 : VGPROp_32);
+}
+
 // Src2 of VOP3 DPP instructions cannot be a literal
 class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
   RegisterOperand ret =
@@ -2578,22 +2592,50 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                 getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
 }
 
-// Return an AGPR+VGPR operand class for the given VGPR register class.
-class getLdStRegisterOperand<RegisterClass RC> {
-  // This type of operands is only used in pseudo instructions helping
-  // code generation and thus doesn't need encoding and decoding methods.
-  // It also doesn't need to support AGPRs, because GFX908/A/40 do not
-  // support True16.
-  defvar VLdSt_16 = RegisterOperand<VGPR_16>;
+class getAlign2RegOp<RegisterOperand RC> {
+  RegisterOperand ret =
+    !cond(!eq(RC, VGPROp_16) : VGPROp_16,
+          !eq(RC, VGPROp_32) : VGPROp_32,
+          !eq(RC, VGPROp_64) : VGPROp_64_Align2,
+          !eq(RC, VGPROp_64_Align1) : VGPROp_64_Align2,
+          !eq(RC, VGPROp_96) : VGPROp_96_Align2,
+          !eq(RC, VGPROp_96_Align1) : VGPROp_96_Align2,
+          !eq(RC, VGPROp_128) : VGPROp_128_Align2,
+          !eq(RC, VGPROp_128_Align1) : VGPROp_128_Align2,
+          !eq(RC, VGPROp_160) : VGPROp_160_Align2,
+          !eq(RC, VGPROp_160_Align1) : VGPROp_160_Align2,
+          !eq(RC, VGPROp_1024) : VGPROp_1024_Align2,
+          !eq(RC, VGPROp_1024_Align1) : VGPROp_1024_Align2,
+          !eq(RC, AVLdSt_32) : AVLdSt_32,
+          !eq(RC, AVLdSt_64) : AVLdSt_64_Align2,
+          !eq(RC, AVLdSt_96) : AVLdSt_96_Align2,
+          !eq(RC, AVLdSt_96_Align1) : AVLdSt_96_Align2,
+          !eq(RC, AVLdSt_128) : AVLdSt_128_Align2,
+          !eq(RC, AVLdSt_128_Align1) : AVLdSt_128_Align2,
+          !eq(RC, AVLdSt_160) : AVLdSt_160_Align2,
+          !eq(RC, AVLdSt_160_Align1) : AVLdSt_160_Align2);
+}
+
+class getEquivalentAGPROperand<RegisterOperand RC> {
+  defvar Size = RC.RegClass.Size;
+  RegisterOperand ret =
+    !cond(!eq(Size, 32)   : RegisterOperand<AGPR_32>,
+          !eq(Size, 64)   : RegisterOperand<AReg_64>,
+          !eq(Size, 96)   : RegisterOperand<AReg_96>,
+          !eq(Size, 128)  : RegisterOperand<AReg_128>,
+          !eq(Size, 160)  : RegisterOperand<AReg_160>,
+          !eq(Size, 1024) : RegisterOperand<AReg_1024>);
+}
 
+class getEquivalentVGPROperand<RegisterOperand RC> {
+  defvar Size = RC.RegClass.Size;
   RegisterOperand ret =
-    !cond(!eq(RC.Size, 16)   : VLdSt_16,
-          !eq(RC.Size, 32)   : AVLdSt_32,
-          !eq(RC.Size, 64)   : AVLdSt_64,
-          !eq(RC.Size, 96)   : AVLdSt_96,
-          !eq(RC.Size, 128)  : AVLdSt_128,
-          !eq(RC.Size, 160)  : AVLdSt_160,
-          !eq(RC.Size, 1024) : AVLdSt_1024);
+    !cond(!eq(Size, 32)   : RegisterOperand<VGPR_32>,
+          !eq(Size, 64)   : RegisterOperand<VReg_64>,
+          !eq(Size, 96)   : RegisterOperand<VReg_96>,
+          !eq(Size, 128)  : RegisterOperand<VReg_128>,
+          !eq(Size, 160)  : RegisterOperand<VReg_160>,
+          !eq(Size, 1024) : RegisterOperand<VReg_1024>);
 }
 
 class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
@@ -2643,7 +2685,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field RegisterOperand Src0DPP = getVregSrcForVT<Src0VT>.ret;
   field RegisterOperand Src1DPP = getVregSrcForVT<Src1VT>.ret;
   field RegisterOperand Src2DPP = getVregSrcForVT<Src2VT>.ret;
-  field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
+  field RegisterOperand Src0VOP3DPP = getVGPRSrcForVT<Src0VT>.ret;
   field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
   field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
   field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
@@ -2859,7 +2901,7 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
   let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
   let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
   let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
-  let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPRSrc_16, VGPRSrc_32);
+  let Src0VOP3DPP = !if (!eq(Src0VT.Size, 16), VGPROp_16, VGPROp_32);
   let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0 /*IsFake16*/>.ret;
   let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0 /*IsFake16*/>.ret;
   let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b450122673..1f7951258c21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -66,7 +66,7 @@ defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
   //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
 
 let OtherPredicates = [isNotGFX90APlus] in {
-let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
+let Constraints = "$src0 = $vdst" in {
 
 defm V_INTERP_P2_F32 : VINTRP_m <
   0x00000001,
@@ -77,7 +77,7 @@ defm V_INTERP_P2_F32 : VINTRP_m <
   [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
 
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
+} // End Constraints = "$src0 = $vdst"
 
 defm V_INTERP_MOV_F32 : VINTRP_m <
   0x00000002,
@@ -326,28 +326,57 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 // clang-format off
-defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+
 multiclass
-    AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
+    AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
   let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
     def !toupper(Op) #"_PSEUDO_" #DataType
-        : VPseudoInstSI<(outs SGPR_32 : $sdst),
-                        (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
-                        [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
+        : VPseudoInstSI<(outs RetReg : $sdst),
+                        (ins Reg : $src, VSrc_b32 : $strategy),
+                        [(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
   }
 }
 // clang-format on
 
+class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
+                   RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
+  string Name = OpName;
+  string TypeString = TypeStr;
+  ValueType VT = Ty;
+  RegisterClass RetReg = ReturnRegisterClass;
+  SrcRegOrImm9 Reg = RC;
+}
+
 // Input list : [Operation_name,
-//              type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
+//              type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+//              value type (bit-width),
+//              output register class,
+//              input register class]
 defvar Operations = [
-  ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
-  ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
-  ["xor", "B32"]
+  WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
+  WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
+
+  WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
+  WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
 ];
 
 foreach Op = Operations in {
-  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
+  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
+                                                      Op.VT, Op.RetReg, Op.Reg>;
 }
 
 let usesCustomInserter = 1, Defs = [VCC] in {
@@ -692,6 +721,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
 def : GCNPat<
   (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
 
+// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN.
+// This is used for tail calls *from* a whole wave function. Tail calls to
+// a whole wave function may use the usual opcodes, depending on the calling
+// convention of the caller.
+def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI <
+  (outs),
+  (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteBranch];
+  let isConvergent = 1;
+
+  // We're going to use custom handling to set the $orig_exec to the correct value.
+  let usesCustomInserter = 1;
+}
+
+// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+  (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff),
+  (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0,
+   tglobaladdr:$callee, i32:$fpdiff)>;
+
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -2174,7 +2230,8 @@ def : GCNPat <
 }
 
 foreach fp16vt = [f16, bf16] in {
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (fcopysign fp16vt:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2205,6 +2262,42 @@ def : GCNPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
              (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 >;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+    (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+    (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+  (fcopysign f32:$src0, fp16vt:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+             (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+  (fcopysign f64:$src0, fp16vt:$src1),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+               (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f32:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+             (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f64:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+             (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+             (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
 } // End foreach fp16vt = [f16, bf16]
 
 
@@ -2480,6 +2573,38 @@ def : AMDGPUPatIgnoreCopies <
               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
 
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+  (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
+// (y | ~x)
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)),
+  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1))
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1)
+>;
+
 // SHA-256 Ch function
 // z ^ (x & (y ^ z))
 def : AMDGPUPatIgnoreCopies <
@@ -3096,6 +3221,11 @@ def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
 }
 
 let True16Predicate = UseRealTrue16Insts in {
@@ -3106,15 +3236,18 @@ def : GCNPat<
 
 def : GCNPat<
   (i64 (DivergentUnaryFrag<zext> i16:$src)),
-  (REG_SEQUENCE VReg_64,
-    (INSERT_SUBREG (i32 (V_MOV_B32_e32 (i32 0))), VGPR_16:$src, lo16), sub0,
-    (S_MOV_B32 (i32 0)), sub1)
+  (REG_SEQUENCE VReg_64, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16, (V_MOV_B32_e32 (i32 0)), sub1)
 >;
 
 def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
 }
 
 def : GCNPat <
@@ -3143,11 +3276,6 @@ def : GCNPat <
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
-def : GCNPat <
-  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
-  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
 def IMMBitSelConst : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
                                    MVT::i32);
@@ -3637,13 +3765,24 @@ def : GCNPat <
 >;
 
 foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in
+let True16Predicate = p in {
 // Take the lower 16 bits from each VGPR_32 and concat them
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
   (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
 >;
 
+// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
+// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
+def : GCNPat <
+  (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
+    (Ty !if(!eq(Ty, i16),
+      (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
+      (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)),  VGPR_32:$a, VGPR_32:$b)
+>;
+}
+
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3669,18 +3808,6 @@ def : GCNPat <
   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
 >;
 
-
-// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
-// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
-def : GCNPat <
-  (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
-    (Ty !if(!eq(Ty, i16),
-      (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
-      (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
-  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)),  VGPR_32:$a, VGPR_32:$b)
->;
-
-
 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
 // Special case, can use V_ALIGNBIT (always uses encoded literal)
 let True16Predicate = NotHasTrue16BitInsts in {
@@ -3752,7 +3879,8 @@ def : GCNPat <
   (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
 >;
 
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (v2f16 (scalar_to_vector f16:$src0)),
   (COPY $src0)
@@ -3772,6 +3900,29 @@ def : GCNPat <
   (v4f16 (scalar_to_vector f16:$src0)),
   (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (v2f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v2i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v4i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+  (v4f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
 
 def : GCNPat <
   (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6f2ea8ad1ff0..69d02e7c2934 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -119,7 +119,7 @@ class SILoadStoreOptimizer {
     unsigned DMask;
     InstClassEnum InstClass;
     unsigned CPol = 0;
-    bool IsAGPR;
+    const TargetRegisterClass *DataRC;
     bool UseST64;
     int AddrIdx[MaxAddressRegs];
     const MachineOperand *AddrReg[MaxAddressRegs];
@@ -203,6 +203,7 @@ class SILoadStoreOptimizer {
   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
 
 private:
+  MachineFunction *MF = nullptr;
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
@@ -245,6 +246,8 @@ private:
 
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
+  unsigned getWrite2Opcode(const CombineInfo &CI) const;
+
   MachineBasicBlock::iterator
   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                   MachineBasicBlock::iterator InsertBefore);
@@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
   if (InstClass == UNKNOWN)
     return;
 
-  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+  DataRC = LSO.getDataRegClass(*MI);
 
   switch (InstClass) {
   case DS_READ:
@@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
   // have already been confirmed to be mergeable.
   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
     offsetsCanBeCombined(CI, *STM, Paired, true);
+
+  if (CI.InstClass == DS_WRITE) {
+    // Both data operands must be AGPR or VGPR, so the data registers need to
+    // be constrained to one or the other. We expect to only emit the VGPR form
+    // here for now.
+    //
+    // FIXME: There is currently a hack in getRegClass to report that the write2
+    // operands are VGPRs. In the future we should have separate agpr
+    // instruction definitions.
+    const MachineOperand *Data0 =
+        TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+    const MachineOperand *Data1 =
+        TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
+
+    const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
+    int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data0);
+    int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data1);
+
+    const TargetRegisterClass *DataRC0 =
+        TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
+
+    const TargetRegisterClass *DataRC1 =
+        TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
+
+    if (unsigned SubReg = Data0->getSubReg()) {
+      DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
+                                              DataRC0, SubReg);
+    }
+
+    if (unsigned SubReg = Data1->getSubReg()) {
+      DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
+                                              DataRC1, SubReg);
+    }
+
+    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
+        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
+      return nullptr;
+
+    // TODO: If one register can be constrained, and not the other, insert a
+    // copy.
+  }
+
   return Where;
 }
 
@@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }
 
+unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
+  return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+}
+
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
@@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
 
   unsigned NewOffset0 = CI.Offset;
   unsigned NewOffset1 = Paired.Offset;
-  unsigned Opc =
-      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+  unsigned Opc = getWrite2Opcode(CI);
 
   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
     }
   }
 
+  // FIXME: This should compute the instruction to use, and then use the result
+  // of TII->getRegClass.
   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
   return TRI->isAGPRClass(getDataRegClass(*CI.I))
              ? TRI->getAGPRClassForBitWidth(BitWidth)
@@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
     if (AddrList.front().InstClass == CI.InstClass &&
-        AddrList.front().IsAGPR == CI.IsAGPR &&
         AddrList.front().hasSameBaseAddress(CI)) {
       AddrList.emplace_back(CI);
       return;
@@ -2465,16 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (!CI.hasMergeableAddress(*MRI))
       continue;
 
-    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
-      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
-      //        operands. However we are reporting that ds_write2 shall have
-      //        only VGPR data so that machine copy propagation does not
-      //        create an illegal instruction with a VGPR and AGPR sources.
-      //        Consequenctially if we create such instruction the verifier
-      //        will complain.
-      continue;
-    }
-
     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
 
     addInstToMergeableList(CI, MergeableInsts);
@@ -2647,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
 }
 
 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
+  this->MF = &MF;
   STM = &MF.getSubtarget<GCNSubtarget>();
   if (!STM->loadStoreOptEnabled())
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1f..1637c06936f9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
   SINGLETHREAD,
   WAVEFRONT,
   WORKGROUP,
+  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
   AGENT,
   SYSTEM
 };
@@ -103,8 +104,10 @@ private:
   bool IsVolatile = false;
   bool IsNonTemporal = false;
   bool IsLastUse = false;
+  bool IsCooperative = false;
 
   SIMemOpInfo(
+      const GCNSubtarget &ST,
       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -112,14 +115,15 @@ private:
       bool IsCrossAddressSpaceOrdering = true,
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
-      bool IsLastUse = false)
+      bool IsLastUse = false, bool IsCooperative = false)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
-        IsLastUse(IsLastUse) {
+        IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
 
     if (Ordering == AtomicOrdering::NotAtomic) {
+      assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
       assert(Scope == SIAtomicScope::NONE &&
              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
              !IsCrossAddressSpaceOrdering &&
@@ -154,6 +158,11 @@ private:
                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
     }
+
+    // On targets that have no concept of a workgroup cluster, use
+    // AGENT scope as a conservatively correct alternative.
+    if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
+      this->Scope = SIAtomicScope::AGENT;
   }
 
 public:
@@ -209,6 +218,9 @@ public:
   /// create this SIMemOpInfo is last use, false otherwise.
   bool isLastUse() const { return IsLastUse; }
 
+  /// \returns True if this is a cooperative load or store atomic.
+  bool isCooperative() const { return IsCooperative; }
+
   /// \returns True if ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo is unordered or higher, false otherwise.
   bool isAtomic() const {
@@ -220,6 +232,7 @@ public:
 class SIMemOpAccess final {
 private:
   const AMDGPUMachineModuleInfo *MMI = nullptr;
+  const GCNSubtarget &ST;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -243,7 +256,7 @@ private:
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -325,6 +338,12 @@ public:
     return false;
   };
 
+  /// Handle cooperative load/store atomics.
+  virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
+    llvm_unreachable(
+        "cooperative atomics are not available on this architecture");
+  }
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -359,6 +378,12 @@ public:
                              bool IsCrossAddrSpaceOrdering,
                              Position Pos) const = 0;
 
+  /// Inserts any necessary instructions before the barrier start instruction
+  /// \p MI in order to support pairing of barriers and fences.
+  virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Virtual destructor to allow derivations to be deleted.
   virtual ~SICacheControl() = default;
 };
@@ -547,6 +572,8 @@ public:
                      SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace,
                      Position Pos) const override;
+
+  bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -587,7 +614,11 @@ protected:
                       SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
 
 public:
-  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+    // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+    // the behavior is the same if assuming GFX12.0 in CU mode.
+    assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
+  }
 
   bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                   SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -604,6 +635,8 @@ public:
 
   bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
 
+  bool handleCooperativeAtomic(MachineInstr &MI) const override;
+
   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                      Position Pos) const override;
@@ -748,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
   if (SSID == MMI->getAgentSSID())
     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
+  if (SSID == MMI->getClusterSSID())
+    return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
   if (SSID == MMI->getWorkgroupSSID())
     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                       true);
@@ -763,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
   if (SSID == MMI->getAgentOneAddressSpaceSSID())
     return std::tuple(SIAtomicScope::AGENT,
                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
+  if (SSID == MMI->getClusterOneAddressSpaceSSID())
+    return std::tuple(SIAtomicScope::CLUSTER,
+                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
     return std::tuple(SIAtomicScope::WORKGROUP,
                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -790,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
   return SIAtomicAddrSpace::OTHER;
 }
 
-SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
-    : MMI(&MMI_) {}
+SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
+                             const GCNSubtarget &ST)
+    : MMI(&MMI_), ST(ST) {}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -804,6 +843,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   bool IsNonTemporal = true;
   bool IsVolatile = false;
   bool IsLastUse = false;
+  bool IsCooperative = false;
 
   // Validator should check whether or not MMOs cover the entire set of
   // locations accessed by the memory instruction.
@@ -811,6 +851,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     IsNonTemporal &= MMO->isNonTemporal();
     IsVolatile |= MMO->isVolatile();
     IsLastUse |= MMO->getFlags() & MOLastUse;
+    IsCooperative |= MMO->getFlags() & MOCooperative;
     InstrAddrSpace |=
       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -850,9 +891,9 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
       return std::nullopt;
     }
   }
-  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+  return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal, IsLastUse);
+                     IsNonTemporal, IsLastUse, IsCooperative);
 }
 
 std::optional<SIMemOpInfo>
@@ -864,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -878,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -919,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
   if (SynchronizeAS)
     OrderingAddrSpace = *SynchronizeAS;
 
-  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
-                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+  return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
+                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+                     AtomicOrdering::NotAtomic);
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -932,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -2169,6 +2211,22 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx10CacheControl::insertBarrierStart(
+    MachineBasicBlock::iterator &MI) const {
+  // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
+  // mode. This is because a CU mode release fence does not emit any wait, which
+  // is fine when only dealing with vmem, but isn't sufficient in the presence
+  // of barriers which do not go through vmem.
+  // GFX12.5 does not require this additional wait.
+  if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+  return true;
+}
+
 bool SIGfx11CacheControl::enableLoadCacheBypass(
     const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
     SIAtomicAddrSpace AddrSpace) const {
@@ -2334,18 +2392,23 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
+    case SIAtomicScope::CLUSTER:
       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
         LOADCnt |= true;
       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
         STORECnt |= true;
       break;
     case SIAtomicScope::WORKGROUP:
-      // In WGP mode the waves of a work-group can be executing on either CU of
-      // the WGP. Therefore need to wait for operations to complete to ensure
-      // they are visible to waves in the other CU as the L0 is per CU.
-      // Otherwise in CU mode and all waves of a work-group are on the same CU
-      // which shares the same L0.
-      if (!ST.isCuModeEnabled()) {
+      // GFX12.0:
+      //   In WGP mode the waves of a work-group can be executing on either CU
+      //   of the WGP. Therefore need to wait for operations to complete to
+      //   ensure they are visible to waves in the other CU as the L0 is per CU.
+      //   Otherwise in CU mode and all waves of a work-group are on the same CU
+      //   which shares the same L0.
+      //
+      // GFX12.5:
+      //   TODO DOCS
+      if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
           LOADCnt |= true;
         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2366,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
+    case SIAtomicScope::CLUSTER:
     case SIAtomicScope::WORKGROUP:
       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
       // not needed as LDS operations for all waves are executed in a total
@@ -2397,7 +2461,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     //
     // This also applies to fences. Fences cannot pair with an instruction
     // tracked with bvh/samplecnt as we don't have any atomics that do that.
-    if (Order != AtomicOrdering::Acquire) {
+    if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
@@ -2448,11 +2512,18 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   case SIAtomicScope::AGENT:
     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
     break;
+  case SIAtomicScope::CLUSTER:
+    ScopeImm = AMDGPU::CPol::SCOPE_SE;
+    break;
   case SIAtomicScope::WORKGROUP:
-    // In WGP mode the waves of a work-group can be executing on either CU of
-    // the WGP. Therefore we need to invalidate the L0 which is per CU.
-    // Otherwise in CU mode all waves of a work-group are on the same CU, and so
-    // the L0 does not need to be invalidated.
+    // GFX12.0:
+    //  In WGP mode the waves of a work-group can be executing on either CU of
+    //  the WGP. Therefore we need to invalidate the L0 which is per CU.
+    //  Otherwise in CU mode all waves of a work-group are on the same CU, and
+    //  so the L0 does not need to be invalidated.
+    //
+    // GFX12.5:
+    //   TODO DOCS
     if (ST.isCuModeEnabled())
       return false;
 
@@ -2497,7 +2568,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
   if (Pos == Position::AFTER)
     ++MI;
 
-  // global_wb is only necessary at system scope for gfx120x targets.
+  // global_wb is only necessary at system scope for GFX12.0;
+  // it is also necessary at device scope for GFX12.5.
   //
   // Emitting it for lower scopes is a slow no-op, so we omit it
   // for performance.
@@ -2507,6 +2579,13 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
         .addImm(AMDGPU::CPol::SCOPE_SYS);
     break;
   case SIAtomicScope::AGENT:
+    // TODO DOCS
+    if (ST.hasGFX1250Insts()) {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+          .addImm(AMDGPU::CPol::SCOPE_DEV);
+    }
+    break;
+  case SIAtomicScope::CLUSTER:
   case SIAtomicScope::WORKGROUP:
     // No WB necessary, but we still have to wait.
     break;
@@ -2569,26 +2648,44 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
 }
 
 bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
-  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
-  if (!CPol)
-    return false;
+  assert(MI.mayStore() && "Not a Store inst");
+  const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+  bool Changed = false;
 
+  // GFX12.5 only: an xcnt wait is needed before flat and global atomic
+  // stores/RMWs.
+  if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+    Changed = true;
+  }
+
+  // Remaining fixes do not apply to RMWs.
+  if (IsRMW)
+    return Changed;
+
+  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+  if (!CPol) // Some vmem operations do not have a scope and are not concerned.
+    return Changed;
   const unsigned Scope = CPol->getImm() & CPol::SCOPE;
 
   // GFX12.0 only: Extra waits needed before system scope stores.
-  if (!ST.hasGFX1250Insts()) {
-    if (!Atomic && Scope == CPol::SCOPE_SYS)
-      return insertWaitsBeforeSystemScopeStore(MI);
-    return false;
-  }
+  if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+    Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
 
-  // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
-  // space.
-  // We also require SCOPE_SE minimum if we not have the "cu-stores" feature.
-  if (Scope == CPol::SCOPE_CU &&
-      (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
-    return setScope(MI, CPol::SCOPE_SE);
+  return Changed;
+}
 
+bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
+  if (!ST.hasGFX1250Insts())
+    return false;
+
+  // Cooperative atomics need to be SCOPE_DEV or higher.
+  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+  assert(CPol && "No CPol operand?");
+  const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+  if (Scope < CPol::SCOPE_DEV)
+    return setScope(MI, CPol::SCOPE_DEV);
   return false;
 }
 
@@ -2605,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
     case SIAtomicScope::AGENT:
       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
       break;
+    case SIAtomicScope::CLUSTER:
+      Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
+      break;
     case SIAtomicScope::WORKGROUP:
       // In workgroup mode, SCOPE_SE is needed as waves can executes on
       // different CUs that access different L0s.
@@ -2656,6 +2756,11 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                            MOI.getOrderingAddrSpace());
     }
 
+    // Handle cooperative atomics after cache bypass step, as it may override
+    // the scope of the instruction to a greater scope.
+    if (MOI.isCooperative())
+      Changed |= CC->handleCooperativeAtomic(*MI);
+
     if (Order == AtomicOrdering::SequentiallyConsistent)
       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                 SIMemOp::LOAD | SIMemOp::STORE,
@@ -2701,6 +2806,11 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                             MOI.getOrderingAddrSpace());
     }
 
+    // Handle cooperative atomics after cache bypass step, as it may override
+    // the scope of the instruction to a greater scope.
+    if (MOI.isCooperative())
+      Changed |= CC->handleCooperativeAtomic(*MI);
+
     if (MOI.getOrdering() == AtomicOrdering::Release ||
         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
       Changed |= CC->insertRelease(MI, MOI.getScope(),
@@ -2778,6 +2888,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   assert(MI->mayLoad() && MI->mayStore());
 
   bool Changed = false;
+  MachineInstr &RMWMI = *MI;
 
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
@@ -2812,6 +2923,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                    Position::AFTER);
     }
 
+    Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
     return Changed;
   }
 
@@ -2839,8 +2951,9 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
 bool SIMemoryLegalizer::run(MachineFunction &MF) {
   bool Changed = false;
 
-  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
-  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
+  CC = SICacheControl::create(ST);
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
@@ -2860,6 +2973,11 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         MI = II->getIterator();
       }
 
+      if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
+        Changed |= CC->insertBarrierStart(MI);
+        continue;
+      }
+
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d0cba30a442b..857cb91a977f 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -291,21 +291,7 @@ static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
   if (!Reg->isReg() || !Reg->isDef())
     return nullptr;
 
-  MachineOperand *ResMO = nullptr;
-  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
-    // If there exist use of subreg of Reg then return nullptr
-    if (!isSameReg(UseMO, *Reg))
-      return nullptr;
-
-    // Check that there is only one instruction that uses Reg
-    if (!ResMO) {
-      ResMO = &UseMO;
-    } else if (ResMO->getParent() != UseMO.getParent()) {
-      return nullptr;
-    }
-  }
-
-  return ResMO;
+  return MRI->getOneNonDBGUse(Reg->getReg());
 }
 
 static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
@@ -313,17 +299,7 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
   if (!Reg->isReg())
     return nullptr;
 
-  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
-  if (!DefInstr)
-    return nullptr;
-
-  for (auto &DefMO : DefInstr->defs()) {
-    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
-      return &DefMO;
-  }
-
-  // Ignore implicit defs.
-  return nullptr;
+  return MRI->getOneDef(Reg->getReg());
 }
 
 /// Combine an SDWA instruction's existing SDWA selection \p Sel with
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68b..5720b978aada 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -184,9 +184,11 @@ bool SIPostRABundler::run(MachineFunction &MF) {
           if (I->getNumExplicitDefs() != 0)
             Defs.insert(I->defs().begin()->getReg());
           ++ClauseLength;
-        } else if (!I->isMetaInstruction()) {
-          // Allow meta instructions in between bundle candidates, but do not
-          // start or end a bundle on one.
+        } else if (!I->isMetaInstruction() ||
+                   I->getOpcode() == AMDGPU::SCHED_BARRIER) {
+          // SCHED_BARRIER is not bundled so the scheduler can honor it later.
+          // Allow other meta instructions in between bundle candidates, but do
+          // not start or end a bundle on one.
           //
           // TODO: It may be better to move meta instructions like dbg_value
           // after the bundle. We're relying on the memory legalizer to unbundle
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ae0f304ea304..22488384759b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3273,6 +3273,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
   return AMDGPUInstPrinter::getRegisterName(Reg);
 }
 
+unsigned SIRegisterInfo::getHWRegIndex(MCRegister Reg) const {
+  return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
+}
+
 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
   return getRegBitWidth(RC.getID());
 }
@@ -3353,6 +3357,40 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
                                 : getAnyVGPRClassForBitWidth(BitWidth);
 }
 
+const TargetRegisterClass *
+SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
+  if (BitWidth <= 32)
+    return &AMDGPU::VGPR_32_Lo256RegClass;
+  if (BitWidth <= 64)
+    return &AMDGPU::VReg_64_Lo256_Align2RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::VReg_96_Lo256_Align2RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::VReg_128_Lo256_Align2RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::VReg_160_Lo256_Align2RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::VReg_192_Lo256_Align2RegClass;
+  if (BitWidth <= 224)
+    return &AMDGPU::VReg_224_Lo256_Align2RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::VReg_256_Lo256_Align2RegClass;
+  if (BitWidth <= 288)
+    return &AMDGPU::VReg_288_Lo256_Align2RegClass;
+  if (BitWidth <= 320)
+    return &AMDGPU::VReg_320_Lo256_Align2RegClass;
+  if (BitWidth <= 352)
+    return &AMDGPU::VReg_352_Lo256_Align2RegClass;
+  if (BitWidth <= 384)
+    return &AMDGPU::VReg_384_Lo256_Align2RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::VReg_512_Lo256_Align2RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
+
+  return nullptr;
+}
+
 static const TargetRegisterClass *
 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
   if (BitWidth == 64)
@@ -3547,7 +3585,17 @@ bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
 const TargetRegisterClass *
 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
   unsigned Size = getRegSizeInBits(*SRC);
-  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+
+  switch (SRC->getID()) {
+  default:
+    break;
+  case AMDGPU::VS_32_Lo256RegClassID:
+  case AMDGPU::VS_64_Lo256RegClassID:
+    return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
+  }
+
+  const TargetRegisterClass *VRC =
+      getAllocatableClass(getVGPRClassForBitWidth(Size));
   assert(VRC && "Invalid register class size");
   return VRC;
 }
@@ -3708,14 +3756,15 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
 
 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                 unsigned Idx) const {
-  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
-      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
+  switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
+  case AMDGPU::RegisterPressureSets::VGPR_32:
+  case AMDGPU::RegisterPressureSets::AGPR_32:
     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                const_cast<MachineFunction &>(MF));
-
-  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
+  case AMDGPU::RegisterPressureSets::SReg_32:
     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                                const_cast<MachineFunction &>(MF));
+  }
 
   llvm_unreachable("Unexpected register pressure set!");
 }
@@ -3944,6 +3993,8 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
     return RC.hasSuperClassEq(
         getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
 
+  assert(&RC != &AMDGPU::VS_64RegClass);
+
   return true;
 }
 
@@ -3956,6 +4007,9 @@ SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
   if (Size <= 32)
     return RC;
 
+  if (RC == &AMDGPU::VS_64RegClass)
+    return &AMDGPU::VS_64_Align2RegClass;
+
   if (isVGPRClass(RC))
     return getAlignedVGPRClassForBitWidth(Size);
   if (isAGPRClass(RC))
@@ -4000,7 +4054,12 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
 unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                                             const TargetRegisterClass &RC,
                                             bool IncludeCalls) const {
-  for (MCPhysReg Reg : reverse(RC.getRegisters()))
+  unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+  ArrayRef<MCPhysReg> Registers =
+      (RC.getID() == AMDGPU::VGPR_32RegClassID)
+          ? RC.getRegisters().take_front(NumArchVGPRs)
+          : RC.getRegisters();
+  for (MCPhysReg Reg : reverse(Registers))
     if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
       return getHWRegIndex(Reg) + 1;
   return 0;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5508f07b1b5f..eeefef1116aa 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -200,13 +200,14 @@ public:
   StringRef getRegAsmName(MCRegister Reg) const override;
 
   // Pseudo regs are not allowed
-  unsigned getHWRegIndex(MCRegister Reg) const {
-    return getEncodingValue(Reg) & 0xff;
-  }
+  unsigned getHWRegIndex(MCRegister Reg) const;
 
   LLVM_READONLY
   const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
 
+  LLVM_READONLY const TargetRegisterClass *
+  getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const;
+
   LLVM_READONLY
   const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0293d4018770..5f5eec49bab0 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -76,17 +76,17 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
 //===----------------------------------------------------------------------===//
 //  Declarations that describe the SI registers
 //===----------------------------------------------------------------------===//
-class SIReg <string n, bits<8> regIdx = 0, bit isVGPR = 0,
+class SIReg <string n, bits<10> regIdx = 0, bit isVGPR = 0,
              bit isAGPR = 0, bit isHi16 = 0> : Register<n> {
   let Namespace = "AMDGPU";
 
   // These are generic helper values we use to form actual register
   // codes. They should not be assumed to match any particular register
   // encodings on any particular subtargets.
-  let HWEncoding{7-0} = regIdx;
-  let HWEncoding{8} = isVGPR;
-  let HWEncoding{9} = isAGPR;
-  let HWEncoding{10} = isHi16;
+  let HWEncoding{9-0} = regIdx;
+  let HWEncoding{10} = isVGPR;
+  let HWEncoding{11} = isAGPR;
+  let HWEncoding{12} = isHi16;
 
   int Index = !cast<int>(regIdx);
 }
@@ -110,17 +110,17 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
   let TSFlags{3} = HasAGPR;
   let TSFlags{4} = HasSGPR;
 
-  // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) 
+  // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
   // to decide which registers to try to assign first. Usually, this RegisterClass priority is given
   // very high priority, if not the highest priority, when considering which VirtReg to allocate next.
   //
-  // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to 
-  // assign more constrained RegisterClasses first. As a result, we prioritize register classes with 
-  // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). 
-  // 
+  // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+  // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+  // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+  //
   // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
   // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
-  // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the 
+  // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
   // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
   // is used for scaling of the bit (i.e. 1 << 4).
   field int BaseClassPriority = 1;
@@ -128,7 +128,7 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
 
 }
 
-multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
+multiclass SIRegLoHi16 <string n, bits<10> regIdx, bit ArtificialHigh = 1,
                         bit isVGPR = 0, bit isAGPR = 0,
                         list<int> DwarfEncodings = [-1, -1]> {
   def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>;
@@ -142,9 +142,10 @@ multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
     let Namespace = "AMDGPU";
     let SubRegIndices = [lo16, hi16];
     let CoveredBySubRegs = !not(ArtificialHigh);
-    let HWEncoding{7-0} = regIdx;
-    let HWEncoding{8} = isVGPR;
-    let HWEncoding{9} = isAGPR;
+
+    let HWEncoding{9-0} = regIdx;
+    let HWEncoding{10} = isVGPR;
+    let HWEncoding{11} = isAGPR;
 
     int Index = !cast<int>(regIdx);
   }
@@ -225,7 +226,7 @@ def SGPR_NULL64 :
 // the high 32 bits. The lower 32 bits are always zero (for base) or
 // -1 (for limit). Since we cannot access the high 32 bits, when we
 // need them, we need to do a 64 bit load and extract the bits manually.
-multiclass ApertureRegister<string name, bits<8> regIdx> {
+multiclass ApertureRegister<string name, bits<10> regIdx> {
   let isConstant = true in {
     // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit
     //  register classes), but if we don't it seems to confuse the TableGen
@@ -313,7 +314,7 @@ foreach Index = 0...15 in {
   defm TTMP#Index           : SIRegLoHi16<"ttmp"#Index, 0>;
 }
 
-multiclass FLAT_SCR_LOHI_m <string n, bits<8> ci_e, bits<8> vi_e> {
+multiclass FLAT_SCR_LOHI_m <string n, bits<10> ci_e, bits<10> vi_e> {
   defm _ci : SIRegLoHi16<n, ci_e>;
   defm _vi : SIRegLoHi16<n, vi_e>;
   defm "" : SIRegLoHi16<n, 0>;
@@ -343,11 +344,12 @@ foreach Index = 0...105 in {
 }
 
 // VGPR registers
-foreach Index = 0...255 in {
+foreach Index = 0...1023 in {
   defm VGPR#Index :
     SIRegLoHi16 <"v"#Index, Index, /*ArtificialHigh=*/ 0,
                  /*isVGPR=*/ 1, /*isAGPR=*/ 0, /*DwarfEncodings=*/
-                 [!add(Index, 2560), !add(Index, 1536)]>;
+                [!if(!le(Index, 511), !add(Index, 2560), -1),
+                 !if(!le(Index, 511), !add(Index, 1536), !add(Index, !sub(3584, 512)))]>;
 }
 
 // AccVGPR registers
@@ -604,15 +606,15 @@ def Reg512Types : RegisterTypes<[v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v
 def Reg1024Types : RegisterTypes<[v32i32, v32f32, v16i64, v16f64]>;
 
 let HasVGPR = 1 in {
-// VOP3 and VINTERP can access 256 lo and 256 hi registers.
+// VOP3 and VINTERP can access 1024 lo and 1024 hi registers.
 def VGPR_16 : SIRegisterClass<"AMDGPU",  Reg16Types.types, 16,
-                            (add (interleave (sequence "VGPR%u_LO16", 0, 255),
-                                             (sequence "VGPR%u_HI16", 0, 255)))> {
+                            (add (interleave (sequence "VGPR%u_LO16", 0, 1023),
+                                             (sequence "VGPR%u_HI16", 0, 1023)))> {
   let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
   let Size = 16;
   let GeneratePressureSet = 0;
 
-  // This is the base class for VGPR{128..255}_{LO16,HI16}.
+  // This is the base class for VGPR{128..1023}_{LO16,HI16}.
   let BaseClassOrder = 17;
 }
 
@@ -633,7 +635,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU",  Reg16Types.types, 16,
 // VGPR 32-bit registers
 // i16/f16 only on VI+
 def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
-                            (add (sequence "VGPR%u", 0, 255))> {
+                            (add (sequence "VGPR%u", 0, 1023))> {
   let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
   let Size = 32;
   let Weight = 1;
@@ -648,46 +650,55 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
   let Size = 32;
   let Weight = 1;
 }
+
+// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
+def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+                                    (add (sequence "VGPR%u", 0, 255))> {
+  let AllocationPriority = 0;
+  let GeneratePressureSet = 0;
+  let Size = 32;
+  let Weight = 1;
+}
 } // End HasVGPR = 1
 
 // VGPR 64-bit registers
-def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;
+def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 1023, 1, 2, "v">;
 
 // VGPR 96-bit registers
-def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">;
+def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 1023, 1, 3, "v">;
 
 // VGPR 128-bit registers
-def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
+def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 1023, 1, 4, "v">;
 
 // VGPR 160-bit registers
-def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
+def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 1023, 1, 5, "v">;
 
 // VGPR 192-bit registers
-def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 1023, 1, 6, "v">;
 
 // VGPR 224-bit registers
-def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 1023, 1, 7, "v">;
 
 // VGPR 256-bit registers
-def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
+def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 1023, 1, 8, "v">;
 
 // VGPR 288-bit registers
-def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 255, 1, 9, "v">;
+def VGPR_288 : SIRegisterTuples<getSubRegs<9>.ret, VGPR_32, 1023, 1, 9, "v">;
 
 // VGPR 320-bit registers
-def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 255, 1, 10, "v">;
+def VGPR_320 : SIRegisterTuples<getSubRegs<10>.ret, VGPR_32, 1023, 1, 10, "v">;
 
 // VGPR 352-bit registers
-def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 255, 1, 11, "v">;
+def VGPR_352 : SIRegisterTuples<getSubRegs<11>.ret, VGPR_32, 1023, 1, 11, "v">;
 
 // VGPR 384-bit registers
-def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 255, 1, 12, "v">;
+def VGPR_384 : SIRegisterTuples<getSubRegs<12>.ret, VGPR_32, 1023, 1, 12, "v">;
 
 // VGPR 512-bit registers
-def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
+def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 1023, 1, 16, "v">;
 
 // VGPR 1024-bit registers
-def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
+def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 1023, 1, 32, "v">;
 
 let HasAGPR = 1 in {
 def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -976,14 +987,14 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
   // Requires n v_mov_b32 to copy
   let CopyCost = numRegs;
 
-  // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the 
-  // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result 
-  // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for 
-  // regsters with numRegs 17+ we give SizePriority of 15. In  practice, there is only one 
-  // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, 
-  // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. 
+  // Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the
+  // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+  // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
+  // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one
+  // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+  // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
   defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
-  
+
   let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
   let Weight = numRegs;
 }
@@ -1003,6 +1014,10 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
       let BaseClassOrder = !sub(!mul(numRegs, 32), 1);
       let RegTupleAlignUnits = 2;
     }
+
+    // Aligned register tuples restricted to the low 256 VGPRs
+    def _Lo256_Align2 : VRegClassBase<numRegs, regTypes,
+        (trunc (decimate regList, 2), !div(!sub(258, numRegs), 2))>;
   }
 }
 
@@ -1100,6 +1115,14 @@ def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2
   let Size = 32;
 }
 
+def VS_32_Lo256 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
+                                  (add VGPR_32_Lo256, SReg_32, LDS_DIRECT_CLASS)> {
+  let isAllocatable = 0;
+  let HasVGPR = 1;
+  let HasSGPR = 1;
+  let Size = 32;
+}
+
 def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_64)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
@@ -1107,12 +1130,27 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
   let Size = 64;
 }
 
+def VS_64_Align2 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
+                                   (add VReg_64_Align2, SReg_64)> {
+  let isAllocatable = 0;
+  let HasVGPR = 1;
+  let HasSGPR = 1;
+  let Size = 64;
+}
+
 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
   let HasVGPR = 1;
   let HasAGPR = 1;
   let BaseClassPriority = 0;
   let Size = 32;
 }
+
+def VS_64_Lo256 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64_Lo256_Align2, SReg_64)> {
+  let isAllocatable = 0;
+  let HasVGPR = 1;
+  let HasSGPR = 1;
+  let Size = 64;
+}
 } // End GeneratePressureSet = 0
 
 // Define a register tuple class, along with one requiring an even
@@ -1249,15 +1287,15 @@ class SrcReg9<RegisterClass regClass> : RegisterOperand<regClass> {
   let DecoderMethod = "decodeSrcReg9<" # regClass.Size # ">";
 }
 
-def VRegSrc_32 : SrcReg9<VGPR_32>;
-def VRegSrc_64 : SrcReg9<VReg_64>;
-def VRegSrc_96 : SrcReg9<VReg_96>;
-def VRegSrc_128: SrcReg9<VReg_128>;
-def VRegSrc_192: SrcReg9<VReg_192>;
-def VRegSrc_256: SrcReg9<VReg_256>;
-def VRegSrc_384: SrcReg9<VReg_384>;
-def VRegSrc_512: SrcReg9<VReg_512>;
-def VRegSrc_1024: SrcReg9<VReg_1024>;
+def VRegSrc_32   : SrcReg9<VGPR_32>;
+def VRegSrc_64   : SrcReg9<VReg_64>;
+def VRegSrc_96   : SrcReg9<VReg_96>;
+def VRegSrc_128  : SrcReg9<VReg_128>;
+def VRegSrc_192  : SrcReg9<VReg_192>;
+def VRegSrc_256  : SrcReg9<VReg_256>;
+def VRegSrc_384  : SrcReg9<VReg_384>;
+def VRegSrc_512  : SrcReg9<VReg_512>;
+def VRegSrc_1024 : SrcReg9<VReg_1024>;
 def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>;
 
 // True 16 Operands
@@ -1269,30 +1307,41 @@ def VRegSrc_fake16: SrcReg9<VGPR_32> {
   let EncoderMethod = "getMachineOpValueT16";
 }
 //===----------------------------------------------------------------------===//
-// VGPRSrc_*
+// VGPROp_*: an 8-bit RegisterOperand wrapper for a VGPR
 //===----------------------------------------------------------------------===//
 
-// An 8-bit RegisterOperand wrapper for a VGPR
-def VGPRSrc_32 : RegisterOperand<VGPR_32> {
-  let DecoderMethod = "DecodeVGPR_32RegisterClass";
+class VGPROp<RegisterClass regClass> : RegisterOperand<regClass> {
+  let DecoderMethod = "Decode" # regClass # "RegisterClass";
 }
-def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
-  let DecoderMethod = "DecodeVGPR_32RegisterClass";
+class VGPROp_Align2<RegisterClass regClass> : RegisterOperand<!cast<RegisterClass>(regClass#_Align2)> {
+  let DecoderMethod = "Decode" # regClass # "RegisterClass";
+}
+multiclass VGPROp_Aligned<RegisterClass regClass> {
+  def _Align1 : VGPROp<regClass>;
+  def _Align2 : VGPROp_Align2<regClass>;
 }
 
-def VGPRSrc_96 : RegisterOperand<VReg_96> {
-  let DecoderMethod = "DecodeVReg_96RegisterClass";
+// TODO: These cases should use default target alignment
+def VGPROp_16 : VGPROp<VGPR_16> {
+  let EncoderMethod = "getMachineOpValueT16";
 }
+def VGPROp_32 : VGPROp<VGPR_32>;
 
-def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
+foreach size = ["64", "96", "128", "160", "192", "224", "256", "288", "512", "1024"] in {
+  def VGPROp_#size : VGPROp<!cast<RegisterClass>("VReg_"#size)>;
+}
+
+foreach size = ["64", "96", "128", "160", "256", "1024"] in {
+  defm VGPROp_#size : VGPROp_Aligned<!cast<RegisterClass>("VReg_"#size)>;
+}
+
+def VGPROp_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
   let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
   let EncoderMethod = "getMachineOpValueT16Lo128";
 }
 
-// True 16 operands.
-def VGPRSrc_16 : RegisterOperand<VGPR_16> {
-  let DecoderMethod = "DecodeVGPR_16RegisterClass";
-  let EncoderMethod = "getMachineOpValueT16";
+def VGPROp_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
+  let DecoderMethod = "DecodeVGPR_32RegisterClass";
 }
 
 //===----------------------------------------------------------------------===//
@@ -1321,7 +1370,9 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">;
 def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">;
 def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">;
 def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">;
+def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">;
 def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">;
+def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">;
 
 // True 16 Operands
 def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">;
@@ -1372,11 +1423,14 @@ class AVLdStOperand<RegisterClass regClass>
   : AVOperand<regClass, "decodeAVLdSt">;
 
 def AVLdSt_32 : AVLdStOperand<AV_32>;
-def AVLdSt_64 : AVLdStOperand<AV_64>;
-def AVLdSt_96 : AVLdStOperand<AV_96>;
-def AVLdSt_128 : AVLdStOperand<AV_128>;
-def AVLdSt_160 : AVLdStOperand<AV_160>;
-def AVLdSt_1024 : AVLdStOperand<AV_1024>;
+
+foreach size = ["64", "96", "128", "160", "256", "1024" ] in {
+  // TODO: These cases should use target align variant
+  def AVLdSt_#size : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
+
+  def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClass>("AV_"#size)>;
+  def AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClass>("AV_"#size#_Align2)>;
+}
 
 //===----------------------------------------------------------------------===//
 //  ACSrc_* Operands with an AGPR or an inline constant
@@ -1395,3 +1449,59 @@ def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_FP32">;
 def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512, "OPERAND_REG_INLINE_AC_INT32">;
 def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_FP32">;
 def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024, "OPERAND_REG_INLINE_AC_INT32">;
+
+//===----------------------------------------------------------------------===//
+//  Tablegen programming utilities
+//===----------------------------------------------------------------------===//
+
+/// Helper function to extract the register class from an
+/// instruction's operand list, which may be a RegisterOperand or a
+/// direct RegisterClass reference.
+class getRegClassFromOp<DAGOperand Op> {
+  SIRegisterClass ret = !if(
+    !isa<RegisterOperand>(Op),
+    !cast<SIRegisterClass>(!cast<RegisterOperand>(Op).RegClass),
+    !cast<SIRegisterClass>(Op));
+}
+
+/// Check if the operand will use an AV_* class.
+class OperandIsAV<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasAGPR, reg_class.HasVGPR);
+}
+
+/// Check if the operand will use an AGPR class.
+class OperandIsAGPR<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasAGPR, !not(reg_class.HasVGPR));
+}
+
+/// Check if the operand will use a VGPR class.
+class OperandIsVGPR<DAGOperand Op> {
+  defvar reg_class = getRegClassFromOp<Op>.ret;
+  bit ret = !and(reg_class.HasVGPR, !not(reg_class.HasAGPR));
+}
+
+class VDstOperandIsAV<dag OperandList> {
+  bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret;
+}
+
+class VDstOperandIsAGPR<dag OperandList> {
+  bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdst")>.ret;
+}
+
+class Data0OperandIsAV<dag OperandList> {
+  bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "data0")>.ret;
+}
+
+class Data0OperandIsAGPR<dag OperandList> {
+  bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "data0")>.ret;
+}
+
+class VDataOperandIsAV<dag OperandList> {
+  bit ret = OperandIsAV<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret;
+}
+
+class VDataOperandIsAGPR<dag OperandList> {
+  bit ret = OperandIsAGPR<!getdagarg<DAGOperand>(OperandList, "vdata")>.ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4bda51d1e959..781c61b073db 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -295,7 +295,6 @@ class SM_Pseudo_Atomic<string opName,
   let has_soffset = offsets.HasSOffset;
 
   let Constraints = !if(isRet, "$sdst = $sdata", "");
-  let DisableEncoding = !if(isRet, "$sdata", "");
 }
 
 multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
@@ -678,7 +677,6 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
   bits<7> sdata;
 
   let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   let cpol{CPolBit.GLC} = ps.glc;
   let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
@@ -1295,7 +1293,6 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
   bits<7> sdata;
 
   let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   let cpol{CPolBit.GLC} = ps.glc;
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index a003a46191a8..12a27db241c4 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -583,7 +583,6 @@ class SOP2_Real<SOP_Pseudo ps, string name = ps.Mnemonic> :
   let mayLoad              = ps.mayLoad;
   let mayStore             = ps.mayStore;
   let Constraints          = ps.Constraints;
-  let DisableEncoding      = ps.DisableEncoding;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
   let isConvergent         = ps.isConvergent;
@@ -934,7 +933,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
     >;
   } // End isReMaterializable = 1
 
-  let Constraints = "$sdst = $src2", DisableEncoding="$src2",
+  let Constraints = "$sdst = $src2",
       isCommutable = 1, AddedComplexity = 20 in {
     def S_FMAC_F32 : SOP2_Pseudo<
       "s_fmac_f32", (outs SReg_32:$sdst),
@@ -949,7 +948,7 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
       "$sdst, $src0, $src1",
       [(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))]
     >;
-  } // End Constraints = "$sdst = $src2", DisableEncoding="$src2",
+  } // End Constraints = "$sdst = $src2",
     // isCommutable = 1, AddedComplexity = 20
 } // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
   // Uses = [MODE], SchedRW = [WriteSFPU]
@@ -994,7 +993,6 @@ class SOPK_Real<SOPK_Pseudo ps, string name = ps.Mnemonic> :
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
   let AsmMatchConverter  = ps.AsmMatchConverter;
-  let DisableEncoding    = ps.DisableEncoding;
   let Constraints        = ps.Constraints;
   let SchedRW            = ps.SchedRW;
   let mayLoad            = ps.mayLoad;
@@ -1116,8 +1114,7 @@ def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>;
 def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>;
 } // End isCompare = 1
 
-let isCommutable = 1, DisableEncoding = "$src0",
-    Constraints = "$sdst = $src0" in {
+let isCommutable = 1, Constraints = "$sdst = $src0" in {
   let Defs = [SCC] in
     def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">;
   def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
@@ -1656,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in {
   def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
 }
 
+
+let SubtargetPredicate = HasWaitXcnt in {
+  def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">;
+}
+
 // Represents the point at which a wave must wait for all outstanding direct loads to LDS.
 // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
 
@@ -1847,6 +1849,13 @@ let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in {
     SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
 } // End SubtargetPredicate = hasWaitXcnt, hasSideEffects = 1
 
+let SubtargetPredicate = Has1024AddressableVGPRs in {
+  def S_SET_VGPR_MSB : SOPP_Pseudo<"s_set_vgpr_msb" , (ins i16imm:$simm16), "$simm16"> {
+    let hasSideEffects = 1;
+    let Defs = [MODE];
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//
@@ -2694,6 +2703,7 @@ defm S_WAIT_STORECNT_DSCNT  : SOPP_Real_32_gfx12<0x049>;
 //===----------------------------------------------------------------------===//
 // SOPP - GFX1250 only.
 //===----------------------------------------------------------------------===//
+defm S_SET_VGPR_MSB   : SOPP_Real_32_gfx12<0x006>;
 defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>;
 defm S_WAIT_XCNT      : SOPP_Real_32_gfx12<0x045>;
 defm S_WAIT_ASYNCCNT  : SOPP_Real_32_gfx12<0x04a>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c740b5e0f09d..14ebbf8e9c92 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -116,6 +116,8 @@ static constexpr CustomOperand MsgOperands[] = {
   {{"MSG_RTN_GET_TBA"},         ID_RTN_GET_TBA,             isGFX11Plus},
   {{"MSG_RTN_GET_TBA_TO_PC"},   ID_RTN_GET_TBA_TO_PC,       isGFX11Plus},
   {{"MSG_RTN_GET_SE_AID_ID"},   ID_RTN_GET_SE_AID_ID,       isGFX12Plus},
+  {{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE,
+                                                            isGFX1250},
 };
 
 static constexpr CustomOperand SysMsgOperands[] = {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 18ee9c16b3ff..9f4f42185d9a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -934,6 +934,10 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
     if (!OpXRegs[CompOprIdx] || !OpYRegs[CompOprIdx])
       continue;
 
+    if (getVGPREncodingMSBs(OpXRegs[CompOprIdx], MRI) !=
+        getVGPREncodingMSBs(OpYRegs[CompOprIdx], MRI))
+      return CompOprIdx;
+
     if (SkipSrc && CompOprIdx >= Component::DST_NUM)
       continue;
 
@@ -1376,6 +1380,9 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
                       ? *EnableWavefrontSize32
                       : STI->getFeatureBits().test(FeatureWavefrontSize32);
 
+  if (STI->getFeatureBits().test(Feature1024AddressableVGPRs))
+    return IsWave32 ? 16 : 8;
+
   return IsWave32 ? 8 : 4;
 }
 
@@ -1396,7 +1403,10 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
 
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
                                 unsigned DynamicVGPRBlockSize) {
-  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+  const auto &Features = STI->getFeatureBits();
+  if (Features.test(FeatureGFX1250Insts))
+    return Features.test(FeatureWavefrontSize32) ? 1024 : 512;
+  if (Features.test(FeatureGFX90AInsts))
     return 512;
 
   // Temporarily check the subtarget feature, until we fully switch to using
@@ -2720,13 +2730,6 @@ bool isInlineValue(unsigned Reg) {
 #undef CASE_GFXPRE11_GFX11PLUS_TO
 #undef MAP_REG2REG
 
-bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
-  assert(OpNo < Desc.NumOperands);
-  unsigned OpType = Desc.operands()[OpNo].OperandType;
-  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
-         OpType <= AMDGPU::OPERAND_SRC_LAST;
-}
-
 bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   assert(OpNo < Desc.NumOperands);
   unsigned OpType = Desc.operands()[OpNo].OperandType;
@@ -2776,6 +2779,7 @@ unsigned getRegBitWidth(unsigned RCID) {
     return 16;
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::VGPR_32RegClassID:
+  case AMDGPU::VGPR_32_Lo256RegClassID:
   case AMDGPU::VRegOrLds_32RegClassID:
   case AMDGPU::AGPR_32RegClassID:
   case AMDGPU::VS_32RegClassID:
@@ -2794,6 +2798,8 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_64_Align2RegClassID:
   case AMDGPU::AV_64RegClassID:
   case AMDGPU::AV_64_Align2RegClassID:
+  case AMDGPU::VReg_64_Lo256_Align2RegClassID:
+  case AMDGPU::VS_64_Lo256RegClassID:
     return 64;
   case AMDGPU::SGPR_96RegClassID:
   case AMDGPU::SReg_96RegClassID:
@@ -2803,6 +2809,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_96_Align2RegClassID:
   case AMDGPU::AV_96RegClassID:
   case AMDGPU::AV_96_Align2RegClassID:
+  case AMDGPU::VReg_96_Lo256_Align2RegClassID:
     return 96;
   case AMDGPU::SGPR_128RegClassID:
   case AMDGPU::SReg_128RegClassID:
@@ -2813,6 +2820,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AV_128RegClassID:
   case AMDGPU::AV_128_Align2RegClassID:
   case AMDGPU::SReg_128_XNULLRegClassID:
+  case AMDGPU::VReg_128_Lo256_Align2RegClassID:
     return 128;
   case AMDGPU::SGPR_160RegClassID:
   case AMDGPU::SReg_160RegClassID:
@@ -2822,6 +2830,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_160_Align2RegClassID:
   case AMDGPU::AV_160RegClassID:
   case AMDGPU::AV_160_Align2RegClassID:
+  case AMDGPU::VReg_160_Lo256_Align2RegClassID:
     return 160;
   case AMDGPU::SGPR_192RegClassID:
   case AMDGPU::SReg_192RegClassID:
@@ -2831,6 +2840,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_192_Align2RegClassID:
   case AMDGPU::AV_192RegClassID:
   case AMDGPU::AV_192_Align2RegClassID:
+  case AMDGPU::VReg_192_Lo256_Align2RegClassID:
     return 192;
   case AMDGPU::SGPR_224RegClassID:
   case AMDGPU::SReg_224RegClassID:
@@ -2840,6 +2850,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_224_Align2RegClassID:
   case AMDGPU::AV_224RegClassID:
   case AMDGPU::AV_224_Align2RegClassID:
+  case AMDGPU::VReg_224_Lo256_Align2RegClassID:
     return 224;
   case AMDGPU::SGPR_256RegClassID:
   case AMDGPU::SReg_256RegClassID:
@@ -2850,6 +2861,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AV_256RegClassID:
   case AMDGPU::AV_256_Align2RegClassID:
   case AMDGPU::SReg_256_XNULLRegClassID:
+  case AMDGPU::VReg_256_Lo256_Align2RegClassID:
     return 256;
   case AMDGPU::SGPR_288RegClassID:
   case AMDGPU::SReg_288RegClassID:
@@ -2859,6 +2871,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_288_Align2RegClassID:
   case AMDGPU::AV_288RegClassID:
   case AMDGPU::AV_288_Align2RegClassID:
+  case AMDGPU::VReg_288_Lo256_Align2RegClassID:
     return 288;
   case AMDGPU::SGPR_320RegClassID:
   case AMDGPU::SReg_320RegClassID:
@@ -2868,6 +2881,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_320_Align2RegClassID:
   case AMDGPU::AV_320RegClassID:
   case AMDGPU::AV_320_Align2RegClassID:
+  case AMDGPU::VReg_320_Lo256_Align2RegClassID:
     return 320;
   case AMDGPU::SGPR_352RegClassID:
   case AMDGPU::SReg_352RegClassID:
@@ -2877,6 +2891,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_352_Align2RegClassID:
   case AMDGPU::AV_352RegClassID:
   case AMDGPU::AV_352_Align2RegClassID:
+  case AMDGPU::VReg_352_Lo256_Align2RegClassID:
     return 352;
   case AMDGPU::SGPR_384RegClassID:
   case AMDGPU::SReg_384RegClassID:
@@ -2886,6 +2901,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_384_Align2RegClassID:
   case AMDGPU::AV_384RegClassID:
   case AMDGPU::AV_384_Align2RegClassID:
+  case AMDGPU::VReg_384_Lo256_Align2RegClassID:
     return 384;
   case AMDGPU::SGPR_512RegClassID:
   case AMDGPU::SReg_512RegClassID:
@@ -2895,6 +2911,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_512_Align2RegClassID:
   case AMDGPU::AV_512RegClassID:
   case AMDGPU::AV_512_Align2RegClassID:
+  case AMDGPU::VReg_512_Lo256_Align2RegClassID:
     return 512;
   case AMDGPU::SGPR_1024RegClassID:
   case AMDGPU::SReg_1024RegClassID:
@@ -2904,6 +2921,7 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::AReg_1024_Align2RegClassID:
   case AMDGPU::AV_1024RegClassID:
   case AMDGPU::AV_1024_Align2RegClassID:
+  case AMDGPU::VReg_1024_Lo256_Align2RegClassID:
     return 1024;
   default:
     llvm_unreachable("Unexpected register class");
@@ -3206,8 +3224,11 @@ bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
 
 bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
                                     int64_t EncodedOffset, bool IsBuffer) {
-  if (isGFX12Plus(ST))
+  if (isGFX12Plus(ST)) {
+    if (IsBuffer && EncodedOffset < 0)
+      return false; // GFX12+ buffer accesses reject negative offsets.
     return isInt<24>(EncodedOffset);
+  }
 
   return !IsBuffer && hasSMRDSignedImmOffset(ST) && isInt<21>(EncodedOffset);
 }
@@ -3321,6 +3342,112 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
                           : getGfx9BufferFormatInfo(Format);
 }
 
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+                                           const MCRegisterInfo &MRI) {
+  static constexpr unsigned VGPRClasses[] = {
+      AMDGPU::VGPR_16RegClassID,  AMDGPU::VGPR_32RegClassID,
+      AMDGPU::VReg_64RegClassID,  AMDGPU::VReg_96RegClassID,
+      AMDGPU::VReg_128RegClassID, AMDGPU::VReg_160RegClassID,
+      AMDGPU::VReg_192RegClassID, AMDGPU::VReg_224RegClassID,
+      AMDGPU::VReg_256RegClassID, AMDGPU::VReg_288RegClassID,
+      AMDGPU::VReg_320RegClassID, AMDGPU::VReg_352RegClassID,
+      AMDGPU::VReg_384RegClassID, AMDGPU::VReg_512RegClassID,
+      AMDGPU::VReg_1024RegClassID};
+
+  for (unsigned RCID : VGPRClasses) {
+    const MCRegisterClass &RC = MRI.getRegClass(RCID);
+    if (RC.contains(Reg))
+      return &RC; // First listed class that contains Reg.
+  }
+
+  return nullptr; // Reg belongs to none of the listed VGPR classes.
+}
+
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) {
+  unsigned Enc = MRI.getEncodingValue(Reg);
+  unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+  return Idx >> 8; // Index bits above the low 8 are the MSBs.
+}
+
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+                          const MCRegisterInfo &MRI) {
+  unsigned Enc = MRI.getEncodingValue(Reg);
+  unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+  if (Idx >= 0x100) // Index does not fit in the low 8 bits.
+    return AMDGPU::NoRegister;
+
+  const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI);
+  if (!RC) // Reg is in none of the VGPR classes.
+    return AMDGPU::NoRegister;
+  return RC->getRegister(Idx | (MSBs << 8)); // MSBs go above bit 7.
+}
+
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc) {
+  static const AMDGPU::OpName VOPOps[4] = {
+      AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName VDSOps[4] = {
+      AMDGPU::OpName::addr, AMDGPU::OpName::data0, AMDGPU::OpName::data1,
+      AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName FLATOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::vdata,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdst};
+  static const AMDGPU::OpName BUFOps[4] = {
+      AMDGPU::OpName::vaddr, AMDGPU::OpName::NUM_OPERAND_NAMES,
+      AMDGPU::OpName::NUM_OPERAND_NAMES, AMDGPU::OpName::vdata};
+  static const AMDGPU::OpName VIMGOps[4] = {
+      AMDGPU::OpName::vaddr0, AMDGPU::OpName::vaddr1, AMDGPU::OpName::vaddr2,
+      AMDGPU::OpName::vdata};
+
+  // For VOPD instructions the MSBs of each Y component operand VGPR address
+  // are supposed to match those of the corresponding X operand; otherwise
+  // the VOPD shall not be combined.
+  static const AMDGPU::OpName VOPDOpsX[4] = {
+      AMDGPU::OpName::src0X, AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vsrc2X,
+      AMDGPU::OpName::vdstX};
+  static const AMDGPU::OpName VOPDOpsY[4] = {
+      AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y,
+      AMDGPU::OpName::vdstY};
+
+  uint64_t TSFlags = Desc.TSFlags; // Keep the full flag word; do not narrow.
+
+  if (TSFlags &
+      (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 |
+       SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) {
+    // LD_SCALE operands ignore MSB.
+    if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 ||
+        Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250)
+      return {}; // No table: both pointers are null.
+    return {VOPOps, nullptr};
+  }
+
+  if (TSFlags & SIInstrFlags::DS)
+    return {VDSOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::FLAT)
+    return {FLATOps, nullptr};
+
+  if (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))
+    return {BUFOps, nullptr};
+
+  if (TSFlags & SIInstrFlags::VIMAGE)
+    return {VIMGOps, nullptr};
+
+  if (AMDGPU::isVOPD(Desc.getOpcode()))
+    return {VOPDOpsX, VOPDOpsY}; // Only VOPD yields a second (Y) table.
+
+  assert(!(TSFlags & SIInstrFlags::MIMG));
+
+  if (TSFlags & (SIInstrFlags::VSAMPLE | SIInstrFlags::EXP))
+    llvm_unreachable("Sample and export VGPR lowering is not implemented and"
+                     " these instructions are not expected on gfx1250");
+
+  return {}; // No table applies to this instruction.
+}
+
 bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode) {
   uint64_t TSFlags = MII.get(Opcode).TSFlags;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 70dfb63cbe04..3fcd16f9290b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1517,6 +1517,7 @@ constexpr bool mayTailCallThisCC(CallingConv::ID CC) {
   switch (CC) {
   case CallingConv::C:
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     return true;
   default:
     return canGuaranteeTCO(CC);
@@ -1590,7 +1591,14 @@ bool isInlineValue(unsigned Reg);
 
 /// Is this an AMDGPU specific source operand? These include registers,
 /// inline constants, literals and mandatory literals (KImm).
-bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
+constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo) {
+  return OpInfo.OperandType >= AMDGPU::OPERAND_SRC_FIRST &&
+         OpInfo.OperandType <= AMDGPU::OPERAND_SRC_LAST; // Inclusive range.
+}
+
+inline bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  return isSISrcOperand(Desc.operands()[OpNo]); // Check operand OpNo of Desc.
+}
 
 /// Is this a KImm operand?
 bool isKImmOperand(const MCInstrDesc &Desc, unsigned OpNo);
@@ -1778,6 +1786,25 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 /// \returns true if the intrinsic is uniform
 bool isIntrinsicAlwaysUniform(unsigned IntrID);
 
+/// \returns a register class for the physical register \p Reg if it is a VGPR
+/// or nullptr otherwise.
+const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg,
+                                           const MCRegisterInfo &MRI);
+
+/// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the
+/// physical register \p Reg.
+unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI);
+
+/// \returns the VGPR for low VGPR \p Reg with \p MSBs set, else NoRegister.
+MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs,
+                          const MCRegisterInfo &MRI);
+
+/// \returns 4-entry operand-name tables for the opcode of \p Desc mapping the
+/// VGPR MSBs set by S_SET_VGPR_MSB to operands. VOPD yields an X and a Y
+/// table; otherwise second is null. Both are null when no table applies.
+std::pair<const AMDGPU::OpName *, const AMDGPU::OpName *>
+getVGPRLoweringOperandTables(const MCInstrDesc &Desc);
+
 /// \returns true if a memory instruction supports scale_offset modifier.
 bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index fd6253daa327..a7a0e33da5e4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -1061,6 +1061,17 @@ VersionTuple AMDGPUPALMetadata::getPALVersion() {
   return VersionTuple(getPALVersion(0), getPALVersion(1));
 }
 
+// Raise a .hardware_stages field to at least Val (set it if still empty).
+void AMDGPUPALMetadata::updateHwStageMaximum(unsigned CC, StringRef field,
+                                             unsigned Val) {
+  msgpack::MapDocNode HwStageFieldMapNode = getHwStage(CC);
+  auto &Node = HwStageFieldMapNode[field];
+  if (Node.isEmpty()) // No value recorded yet.
+    Node = Val;
+  else
+    Node = std::max<unsigned>(Node.getUInt(), Val); // Keep the larger.
+}
+
 // Set the field in a given .hardware_stages entry
 void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) {
   getHwStage(CC)[field] = Val;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 4830db5fda50..e50150cc8de9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -156,6 +156,7 @@ public:
   unsigned getPALMinorVersion();
   VersionTuple getPALVersion();
 
+  void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val);
   void setHwStage(unsigned CC, StringRef field, unsigned Val);
   void setHwStage(unsigned CC, StringRef field, bool Val);
   void setHwStage(unsigned CC, StringRef field, msgpack::Type Type,
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 11c72751dde5..f816d7de27ee 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -71,7 +71,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let isCodeGenOnly = 0;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
@@ -80,7 +79,6 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let AsmVariantName     = ps.AsmVariantName;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses                 = ps.Uses;
@@ -584,7 +582,6 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
 let SubtargetPredicate = isGFX9Plus in {
   def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> {
     let Constraints = "$vdst = $src1, $vdst1 = $src0";
-    let DisableEncoding = "$vdst1,$src1";
     let SchedRW = [Write64Bit, Write64Bit];
   }
 
@@ -802,7 +799,6 @@ let SubtargetPredicate = isGFX10Plus in {
 
     def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> {
       let Constraints = "$vdst = $src1, $vdst1 = $src0";
-      let DisableEncoding = "$vdst1,$src1";
       let SchedRW = [Write64Bit, Write64Bit];
     }
   } // End Uses = [M0]
@@ -831,7 +827,6 @@ def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
 let SubtargetPredicate = isGFX11Plus in {
   def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */true> {
     let Constraints = "$vdst = $src1, $vdst1 = $src0";
-    let DisableEncoding = "$vdst1, $src1";
     let SchedRW = [Write64Bit, Write64Bit];
     let True16Predicate = UseRealTrue16Insts;
   }
@@ -849,7 +844,6 @@ let SubtargetPredicate = HasPrngInst in
 defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
 
 let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
-     DisableEncoding="$vdst_in,$src0_out",
      SchedRW = [Write32Bit, Write32Bit],
      isConvergent = 1 in {
 let SubtargetPredicate = HasPermlane16Swap in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9de7d6d009fe..cff66aaedb11 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,7 +105,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let isCodeGenOnly = 0;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
@@ -114,7 +113,6 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let AsmVariantName     = ps.AsmVariantName;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses                 = ps.Uses;
@@ -418,12 +416,12 @@ def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
   let IsTrue16 = 1;
   let IsRealTrue16 = 1;
   let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
-  let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPRSrc_16_Lo128:$src1, ImmOpType:$imm);
+  let Ins32 = (ins VSrcT_f16_Lo128:$src0, VGPROp_16_Lo128:$src1, ImmOpType:$imm);
 }
 def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
   let IsTrue16 = 1;
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
-  let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm);
+  let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPROp_32_Lo128:$src1, ImmOpType:$imm);
 }
 def VOP_MADAK_F32 : VOP_MADAK <f32>;
 def VOP_MADAK_F64 : VOP_MADAK <f64>;
@@ -454,12 +452,12 @@ def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
   let IsTrue16 = 1;
   let IsRealTrue16 = 1;
   let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
-  let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_16_Lo128:$src1);
+  let Ins32 = (ins VSrcT_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_16_Lo128:$src1);
 }
 def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
   let IsTrue16 = 1;
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
-  let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1);
+  let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPROp_32_Lo128:$src1);
 }
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
 def VOP_MADMK_F64 : VOP_MADMK <f64>;
@@ -498,14 +496,14 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
                        HasClamp, HasModifiers, HasModifiers, HasOMod,
                        Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret;
   // We need a dummy src2 tied to dst to track the use of that register for s_delay_alu
-  let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPRSrc_32:$src2X);
-  let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPRSrc_32:$src2Y);
+  let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPROp_32:$src2X);
+  let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPROp_32:$src2Y);
   let InsVOPD3X = (ins Src0ModVOPD3:$src0X_modifiers, Src0VOPD3:$src0X,
                        Src1ModVOPD3:$vsrc1X_modifiers, Src1RC32:$vsrc1X,
-                       VGPRSrc_32:$src2X);
+                       VGPROp_32:$src2X);
   let InsVOPD3Y = (ins Src0ModVOPD3:$src0Y_modifiers, Src0VOPD3:$src0Y,
                        Src1ModVOPD3:$vsrc1Y_modifiers, Src1RC32:$vsrc1Y,
-                       VGPRSrc_32:$src2Y);
+                       VGPROp_32:$src2Y);
 
   let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                      Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
@@ -567,7 +565,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
   let DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue*/, 1/*IsVOP3Encoding*/>.ret;
   let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
   let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
-  let Src0VOP3DPP = VGPRSrc_16;
+  let Src0VOP3DPP = VGPROp_16;
   let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
   let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
   let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
@@ -599,7 +597,7 @@ def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
                      getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
                      dpp8:$dpp8, Dpp8FI:$fi);
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
-  let Src0VOP3DPP = VGPRSrc_32;
+  let Src0VOP3DPP = VGPROp_32;
   let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
   let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
   let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret;
@@ -798,7 +796,7 @@ def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
                     Src2RC64, NumSrcArgs,
                     HasClamp, 1/*HasModifiers*/, 0/*HasSrc2Mods*/, HasOMod,
                     Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/>.ret;
-  let Src0VOP3DPP = VGPRSrc_16;
+  let Src0VOP3DPP = VGPROp_16;
   let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
   let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 0/*IsFake16*/>.ret;
   let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 0/*IsFake16*/>.ret;
@@ -810,7 +808,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
   let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
   let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
 
-  let Src0VOP3DPP = VGPRSrc_32;
+  let Src0VOP3DPP = VGPROp_32;
   let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
   let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 1/*IsFake16*/>.ret;
   let Src1ModVOP3DPP = getSrcModVOP3VC<f16, 1/*IsFake16*/>.ret;
@@ -889,13 +887,13 @@ defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
 
 let mayRaiseFPException = 0 in {
 let OtherPredicates = [HasMadMacF32Insts] in {
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2",
     isConvertibleToThreeAddress = 1 in {
 defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
 
 let SubtargetPredicate = isGFX6GFX7GFX10 in
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>;
-} // End Constraints = "$vdst = $src2", DisableEncoding="$src2",
+} // End Constraints = "$vdst = $src2",
   //     isConvertibleToThreeAddress = 1
 
 let isReMaterializable = 1 in
@@ -941,9 +939,9 @@ defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = []<Register> in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>;
-let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>;
-} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
+} // End IsNeverUniform, $vdst = $vdst_in
 } // End isConvergent = 1
 
 foreach vt = Reg32Types.types in {
@@ -1175,7 +1173,6 @@ let True16Predicate = UseFakeTrue16Insts in {
 } // End FPDPRounding  = 1, isReMaterializable = 1, FixedSize = 1
 
 let Constraints = "$vdst = $src2",
-    DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in {
 let SubtargetPredicate = isGFX10Plus in {
@@ -1209,7 +1206,7 @@ let SubtargetPredicate = isGFX8GFX9 in {
 } // End isReMaterializable = 1
 
 // FIXME: Missing FPDPRounding
-let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+let Constraints = "$vdst = $src2",
     isConvertibleToThreeAddress = 1, isCommutable = 1 in {
 defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
 }
@@ -1252,7 +1249,6 @@ def : GCNPat<
 >;
 
 let Constraints = "$vdst = $src2",
-    DisableEncoding = "$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in
 defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
@@ -1261,7 +1257,6 @@ defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">;
 let SubtargetPredicate = HasFmaLegacy32 in {
 
 let Constraints = "$vdst = $src2",
-    DisableEncoding = "$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in
 defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
@@ -1270,14 +1265,12 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
 
 let SubtargetPredicate = HasFmacF64Inst,
     Constraints = "$vdst = $src2",
-    DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1,
     SchedRW = [WriteDoubleAdd] in
 defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;
 
 let Constraints = "$vdst = $src2",
-      DisableEncoding="$src2",
       isConvertibleToThreeAddress = 1,
       isCommutable = 1,
       IsDOT = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 329d003cf250..19eabb46752b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -782,7 +782,7 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", V_LSHL_ADD_U64_PROF>;
 
 let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
     SchedRW = [WriteFloatCvt] in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in
       defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile<>,
                                                           VOP3_CVT_PK_F8_F32_Profile_t16<>,
@@ -807,7 +807,7 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
   // These instructions have non-standard use of op_sel. In particular they are
   // using op_sel bits 2 and 3 while only having two sources. Therefore dummy
   // src2 is used to hold the op_sel value.
-  let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
+  let Constraints = "$vdst = $src2", SubtargetPredicate = isGFX940Plus in {
     defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
     defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
   }
@@ -1309,7 +1309,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     defm V_CVT_SCALEF32_SR_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>;
     defm V_CVT_SCALEF32_SR_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>;
     defm V_CVT_SCALEF32_SR_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_fp8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>;
@@ -1325,7 +1325,7 @@ let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in
 }
 
 let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     defm V_CVT_SCALEF32_SR_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_bf16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_BF16_I32_F32>>;
     defm V_CVT_SCALEF32_SR_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f16", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F16_I32_F32>>;
     defm V_CVT_SCALEF32_SR_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_bf8_f32", VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOP_I32_F32_I32_F32>>;
@@ -1342,7 +1342,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
 
 let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
   defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
     let Constraints = "@earlyclobber $vdst" in {
       defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
@@ -1358,7 +1358,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
 
   // These instructions have non-standard use of op_sel. In particular they are
   // using op_sel bits 2 and 3 while only having two sources.
-  let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+  let Constraints = "$vdst = $src2" in {
     defm V_CVT_SCALEF32_PK_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2F16_F32_F32>>;
     defm V_CVT_SCALEF32_PK_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_bf16", VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOP_I32_V2BF16_F32_F32>>;
   }
@@ -1486,10 +1486,10 @@ let SubtargetPredicate = isGFX10Plus in {
   } // End isCommutable = 1, isReMaterializable = 1
   def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
 
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in {
+  let Constraints = "$vdst = $vdst_in", isConvergent = 1 in {
     defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
     defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
-  } // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1
+  } // End $vdst = $vdst_in, isConvergent = 1
 
   foreach vt = Reg32Types.types in {
     def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
@@ -1532,10 +1532,10 @@ let True16Predicate = UseFakeTrue16Insts in {
 } // End True16Predicate = UseFakeTrue16Insts
 
 let SubtargetPredicate = isGFX12Plus in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     defm V_PERMLANE16_VAR_B32  : VOP3Inst<"v_permlane16_var_b32",  VOP3_PERMLANE_VAR_Profile>;
     defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>;
-  } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+  } // End $vdst = $vdst_in
 
   def : PermlaneVarPat<int_amdgcn_permlane16_var,  V_PERMLANE16_VAR_B32_e64>;
   def : PermlaneVarPat<int_amdgcn_permlanex16_var, V_PERMLANEX16_VAR_B32_e64>;
@@ -1763,7 +1763,7 @@ let SubtargetPredicate = isGFX1250Plus in {
 
     // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel
     // to select a byte in the vdst. Bits 0 and 1 are unused.
-    let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+    let Constraints = "$vdst = $vdst_in" in {
       defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile,
                                                           VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
       defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile,
@@ -1850,7 +1850,7 @@ class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, Va
 >;
 
 let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in" in {
     defm V_CVT_SR_F16_F32   : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
     defm V_CVT_SR_BF16_F32  : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
   }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ce280d484da1..6f778a0d262a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -44,7 +44,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
                FP16InputMods:$src1_modifiers, Src1RC:$src1,
                FP16InputMods:$src2_modifiers, Src2RC:$src2);
     dag dpp_srcs =
-          (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
+          (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0,
                FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
                FP16InputMods:$src2_modifiers, Src2RC:$src2);
 
@@ -84,7 +84,6 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
   def NAME : VOP3P_Pseudo<OpName, P> {
     let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
-    let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
   }
   let SubtargetPredicate = isGFX11Plus in {
     if P.HasExtVOP3DPP then
@@ -92,7 +91,6 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
         let VOP3P = 1;
         let PseudoInstr = OpName#"_dpp";
         let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
-        let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
       }
   } // end SubtargetPredicate = isGFX11Plus
 }
@@ -1166,7 +1164,7 @@ let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
 } // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
 
 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
-  let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
+  let Constraints = "$vdst = $src2",
       isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
     def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>;
   }
@@ -1520,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
   dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
                                    (ins));
   dag MatrixScaleSrc = !if(HasMatrixScale,
-                           !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
-                                        (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+                           !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
+                                        (ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
                            (ins));
   dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
                                              MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
@@ -1859,8 +1857,8 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32   : WMMAInstGFX12<"v_wmma_scale_f32_32x16
 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
 } // End is_wmma_xdl = 1.
 
-defm V_WMMA_LD_SCALE_PAIRED_B32   : VOP3PInst<"v_wmma_ld_scale_paired_b32",   VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64>>;
+defm V_WMMA_LD_SCALE_PAIRED_B32   : VOP3PInst<"v_wmma_ld_scale_paired_b32",   VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>;
 } // End SubtargetPredicate = isGFX125xOnly
 } // End WaveSizePredicate = isWave32
 
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 2c1193509da9..2730ec52294e 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -100,7 +100,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_16;
+    let Src0VOP3DPP = VGPROp_16;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
 
@@ -126,7 +126,7 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_32;
+    let Src0VOP3DPP = VGPROp_32;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
 
@@ -173,7 +173,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_16;
+    let Src0VOP3DPP = VGPROp_16;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
 
@@ -197,7 +197,7 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_32;
+    let Src0VOP3DPP = VGPROp_32;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
 
@@ -251,7 +251,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
   let isCodeGenOnly = 0;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
@@ -259,7 +258,6 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
   let OtherPredicates    = ps.OtherPredicates;
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses                 = ps.Uses;
@@ -894,7 +892,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
   // DPP8 forbids modifiers and can inherit from VOPC_Profile
 
   let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
-  dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
+  dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPROp_32:$src0, VCSrc_b32:$src1);
   let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
                                                        (ins)));
   let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -917,7 +915,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_16;
+    let Src0VOP3DPP = VGPROp_16;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
 
@@ -943,7 +941,7 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_32;
+    let Src0VOP3DPP = VGPROp_32;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
 
@@ -987,7 +985,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_16;
+    let Src0VOP3DPP = VGPROp_16;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
 
@@ -1011,7 +1009,7 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
     let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
     let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
     let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
-    let Src0VOP3DPP = VGPRSrc_32;
+    let Src0VOP3DPP = VGPROp_32;
     let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
     let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
 
diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
index 3e7af12f6b60..f416c0654048 100644
--- a/llvm/lib/Target/AMDGPU/VOPDInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td
@@ -138,10 +138,6 @@ class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
   string ConstraintsY = !if(hasSrc2AccY, "$src2Y = $vdstY", "");
   let Constraints =
       ConstraintsX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # ConstraintsY;
-  string DisableEncodingX = !if(hasSrc2AccX, "$src2X", "");
-  string DisableEncodingY = !if(hasSrc2AccY, "$src2Y", "");
-  let DisableEncoding =
-      DisableEncodingX # !if(!and(hasSrc2AccX, hasSrc2AccY), ", ", "") # DisableEncodingY;
 
   let Uses = RegListUnion<VDX.Uses, VDY.Uses>.ret;
   let Defs = RegListUnion<VDX.Defs, VDY.Defs>.ret;
@@ -228,7 +224,7 @@ foreach Gen = [GFX11GenD, GFX12GenD, GFX1250GenD] in {
       defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
       defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
       defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2) # Gen.Suffix;
-      defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
+      defvar outs = (outs VGPROp_32:$vdstX, VOPDDstYOperand:$vdstY);
       if !or(isOpXMADK, isOpYMADK) then {
         // If Both X and Y are MADK, the mandatory literal of X additionally must
         // use an alternate operand format which defers to the 'real' Y literal.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 3cad5a1c2c37..5550a0c08b91 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -187,7 +187,6 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let AsmVariantName     = ps.AsmVariantName;
   let Constraints        = ps.Constraints;
-  let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses                 = ps.Uses;
@@ -807,7 +806,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
   let hasSideEffects = ps.hasSideEffects;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // Copy relevant pseudo op flags
   let SubtargetPredicate   = ps.SubtargetPredicate;
@@ -817,7 +815,6 @@ class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> :
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let DecoderNamespace     = ps.DecoderNamespace;
   let Constraints          = ps.Constraints;
-  let DisableEncoding      = ps.DisableEncoding;
   let TSFlags              = ps.TSFlags;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
@@ -841,7 +838,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let hasSideEffects = ps.hasSideEffects;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   let SubtargetPredicate = ps.SubtargetPredicate;
   let AssemblerPredicate = HasSDWA9;
@@ -854,7 +850,6 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
   let AsmMatchConverter    = ps.AsmMatchConverter;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Constraints          = ps.Constraints;
-  let DisableEncoding      = ps.DisableEncoding;
   let TSFlags              = ps.TSFlags;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
@@ -1037,7 +1032,6 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
-  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
   let DecoderNamespace = "GFX8";
 }
 
@@ -1066,7 +1060,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   let hasSideEffects = ps.hasSideEffects;
 
   let Constraints     = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   // Copy relevant pseudo op flags
   let isConvergent         = ps.isConvergent;
@@ -1079,7 +1072,6 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let DecoderNamespace     = ps.DecoderNamespace;
   let Constraints          = ps.Constraints;
-  let DisableEncoding      = ps.DisableEncoding;
   let TSFlags              = ps.TSFlags;
   let Uses                 = ps.Uses;
   let Defs                 = ps.Defs;
@@ -1109,7 +1101,6 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
   let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
                                         AMDGPUAsmVariants.Disable);
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
-  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
   let DecoderNamespace = "GFX8";
 }
 
@@ -1228,7 +1219,6 @@ class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string
   let AssemblerPredicate = HasDPP8;
   let AsmVariantName = AMDGPUAsmVariants.DPP;
   let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
-  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
 }
 
 class VOP_DPP8<string OpName, VOPProfile P> :
diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td
index bd2ed0057617..0560bb1dc966 100644
--- a/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -964,12 +964,10 @@ class F16_OP_U7<bit i, string asmstr> :
 
 // Special types for different instruction operands.
 def ccond : Operand<i32> {
-  let MIOperandInfo = (ops i32imm);
   let PrintMethod = "printPredicateOperand";
 }
 
 def brccond : Operand<i32> {
-  let MIOperandInfo = (ops i32imm);
   let PrintMethod = "printBRCCPredicateOperand";
 }
 
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 8a89bdb546f3..05bcb3596ac4 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -44,7 +44,7 @@ enum TSFlagsConstants {
 void ARCInstrInfo::anchor() {}
 
 ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST)
-    : ARCGenInstrInfo(ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
+    : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
 
 static bool isZeroImm(const MachineOperand &Op) {
   return Op.isImm() && Op.getImm() == 0;
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td
index f26b49119cab..8ff5f4a39ca7 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -18,7 +18,7 @@ include "ARCInstrFormats.td"
 
 // Operand for printing out a condition code.
 let PrintMethod = "printCCOperand" in
-  def CCOp : PredicateOperand<i32, (ops i32imm), (ops)>;
+  def CCOp : PredicateOperand<i32, (ops), (ops)>;
 
 // The "u6" operand of a RRU6-type instruction
 let PrintMethod = "printU6" in {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9e4dbecc16a8..5c35b3327c16 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -107,9 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = {
   { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
 };
 
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
-  : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
-    Subtarget(STI) {
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
+    : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+      Subtarget(STI) {
   for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) {
     if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
       llvm_unreachable("Duplicated entries?");
@@ -6730,7 +6730,7 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD,
         Register Reg = S.getReg();
         auto CIter = CrossIterationNeeds.find(Reg.id());
         if (CIter != CrossIterationNeeds.end()) {
-          auto Stg2 = SMS.stageScheduled(const_cast<SUnit *>(S.getSUnit()));
+          auto Stg2 = SMS.stageScheduled(S.getSUnit());
           assert(Stg2 <= Stg && "Data dependence upon earlier stage");
           if (Stg - Stg2 < MAX_STAGES)
             CIter->second.set(Stg - Stg2);
diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index ec907995e3ab..3d8ebfeae81d 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -218,7 +218,7 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
     return false;
   LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n");
   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo());
+  TII = ST.getInstrInfo();
   BBUtils = std::make_unique<ARMBasicBlockUtils>(MF);
   MF.RenumberBlocks();
   BBUtils->computeAllBlockSizes();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 12d2d678ff63..d4d3c7009527 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -529,56 +529,56 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
         const RTLIB::LibcallImpl Impl;
       } LibraryCalls[] = {
         // Single-precision floating-point arithmetic.
-        { RTLIB::ADD_F32, RTLIB::__addsf3vfp },
-        { RTLIB::SUB_F32, RTLIB::__subsf3vfp },
-        { RTLIB::MUL_F32, RTLIB::__mulsf3vfp },
-        { RTLIB::DIV_F32, RTLIB::__divsf3vfp },
+        { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
+        { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
+        { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
+        { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
 
         // Double-precision floating-point arithmetic.
-        { RTLIB::ADD_F64, RTLIB::__adddf3vfp },
-        { RTLIB::SUB_F64, RTLIB::__subdf3vfp },
-        { RTLIB::MUL_F64, RTLIB::__muldf3vfp },
-        { RTLIB::DIV_F64, RTLIB::__divdf3vfp },
+        { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
+        { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
+        { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
+        { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
 
         // Single-precision comparisons.
-        { RTLIB::OEQ_F32, RTLIB::__eqsf2vfp },
-        { RTLIB::UNE_F32, RTLIB::__nesf2vfp },
-        { RTLIB::OLT_F32, RTLIB::__ltsf2vfp },
-        { RTLIB::OLE_F32, RTLIB::__lesf2vfp },
-        { RTLIB::OGE_F32, RTLIB::__gesf2vfp },
-        { RTLIB::OGT_F32, RTLIB::__gtsf2vfp },
-        { RTLIB::UO_F32,  RTLIB::__unordsf2vfp },
+        { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
+        { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
+        { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
+        { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
+        { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
+        { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
+        { RTLIB::UO_F32,  RTLIB::impl___unordsf2vfp },
 
         // Double-precision comparisons.
-        { RTLIB::OEQ_F64, RTLIB::__eqdf2vfp },
-        { RTLIB::UNE_F64, RTLIB::__nedf2vfp },
-        { RTLIB::OLT_F64, RTLIB::__ltdf2vfp },
-        { RTLIB::OLE_F64, RTLIB::__ledf2vfp },
-        { RTLIB::OGE_F64, RTLIB::__gedf2vfp },
-        { RTLIB::OGT_F64, RTLIB::__gtdf2vfp },
-        { RTLIB::UO_F64,  RTLIB::__unorddf2vfp },
+        { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
+        { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
+        { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
+        { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
+        { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
+        { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
+        { RTLIB::UO_F64,  RTLIB::impl___unorddf2vfp },
 
         // Floating-point to integer conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
-        { RTLIB::FPTOSINT_F64_I32, RTLIB::__fixdfsivfp },
-        { RTLIB::FPTOUINT_F64_I32, RTLIB::__fixunsdfsivfp },
-        { RTLIB::FPTOSINT_F32_I32, RTLIB::__fixsfsivfp },
-        { RTLIB::FPTOUINT_F32_I32, RTLIB::__fixunssfsivfp },
+        { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
+        { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
+        { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
+        { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
 
         // Conversions between floating types.
-        { RTLIB::FPROUND_F64_F32, RTLIB::__truncdfsf2vfp },
-        { RTLIB::FPEXT_F32_F64,   RTLIB::__extendsfdf2vfp },
+        { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
+        { RTLIB::FPEXT_F32_F64,   RTLIB::impl___extendsfdf2vfp },
 
         // Integer to floating-point conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
         // e.g., __floatunsidf vs. __floatunssidfvfp.
-        { RTLIB::SINTTOFP_I32_F64, RTLIB::__floatsidfvfp },
-        { RTLIB::UINTTOFP_I32_F64, RTLIB::__floatunssidfvfp },
-        { RTLIB::SINTTOFP_I32_F32, RTLIB::__floatsisfvfp },
-        { RTLIB::UINTTOFP_I32_F32, RTLIB::__floatunssisfvfp },
+        { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
+        { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
+        { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
+        { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
       };
       // clang-format on
 
@@ -3403,7 +3403,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
   // position-independent addressing modes.
   if (Subtarget->genExecuteOnly()) {
     auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
-    auto T = const_cast<Type*>(CP->getType());
+    auto *T = CP->getType();
     auto C = const_cast<Constant*>(CP->getConstVal());
     auto M = DAG.getMachineFunction().getFunction().getParent();
     auto GV = new GlobalVariable(
@@ -5570,7 +5570,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
   llvm_unreachable("Unknown VFP cmp argument!");
 }
 
-/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// OptimizeVFPBrcond - With nnan, it's legal to optimize some
 /// f32 and even f64 comparisons to integer ones.
 SDValue
 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
@@ -5712,9 +5712,12 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
   }
 
-  if (getTargetMachine().Options.UnsafeFPMath &&
-      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
-       CC == ISD::SETNE || CC == ISD::SETUNE)) {
+  SDNodeFlags Flags = Op->getFlags();
+  if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) &&
+      (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
+       DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
+       CC == ISD::SETUNE)) {
     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
       return Result;
   }
@@ -10539,19 +10542,11 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Generate the operation with flags
-  SDValue OpWithFlags;
-  if (Opcode == ARMISD::ADDC) {
-    // Use ADDC: LHS + RHS (where RHS was 0 - X, now X)
-    OpWithFlags = DAG.getNode(ARMISD::ADDC, dl,
-                              DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
-  } else {
-    // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
-    OpWithFlags = DAG.getNode(ARMISD::SUBC, dl,
-                              DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
-  }
+  SDValue OpWithFlags =
+      DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
 
-  SDValue OpResult = OpWithFlags.getValue(0); // The operation result
-  SDValue Flags = OpWithFlags.getValue(1);    // The flags
+  SDValue OpResult = OpWithFlags.getValue(0);
+  SDValue Flags = OpWithFlags.getValue(1);
 
   // Constants for conditional moves
   SDValue One = DAG.getConstant(1, dl, MVT::i32);
@@ -20073,6 +20068,29 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known = KnownOp0.intersectWith(KnownOp1);
     break;
   }
+  case ARMISD::VORRIMM:
+  case ARMISD::VBICIMM: {
+    unsigned Encoded = Op.getConstantOperandVal(1);
+    unsigned DecEltBits = 0;
+    uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
+
+    unsigned EltBits = Op.getScalarValueSizeInBits();
+    if (EltBits != DecEltBits) {
+      // Be conservative: only update Known when EltBits == DecEltBits.
+      // This is believed to always be true for VORRIMM/VBICIMM today, but if
+      // that changes in the future, doing nothing here is safer than risking
+      // subtle bugs.
+      break;
+    }
+
+    KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+    bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
+    APInt Imm(DecEltBits, DecodedVal);
+
+    Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
+    Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
+    break;
+  }
   }
 }
 
@@ -20200,37 +20218,6 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
 //                           ARM Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
-  // Looking for "rev" which is V6+.
-  if (!Subtarget->hasV6Ops())
-    return false;
-
-  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
-  StringRef AsmStr = IA->getAsmString();
-  SmallVector<StringRef, 4> AsmPieces;
-  SplitString(AsmStr, AsmPieces, ";\n");
-
-  switch (AsmPieces.size()) {
-  default: return false;
-  case 1:
-    AsmStr = AsmPieces[0];
-    AsmPieces.clear();
-    SplitString(AsmStr, AsmPieces, " \t,");
-
-    // rev $0, $1
-    if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" &&
-        AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
-        IA->getConstraintString().starts_with("=l,l")) {
-      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-      if (Ty && Ty->getBitWidth() == 32)
-        return IntrinsicLowering::LowerToByteSwap(CI);
-    }
-    break;
-  }
-
-  return false;
-}
-
 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   // At this point, we have to lower this constraint to something else, so we
   // lower it to an "r" or "w". However, by doing this we will force the result
@@ -21379,12 +21366,25 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
   return false;
 }
 
+bool ARMTargetLowering::canCreateUndefOrPoisonForTargetNode(
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
+  case ARMISD::VORRIMM:
+  case ARMISD::VBICIMM:
+    return false;
+  }
+  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
+      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
+}
+
 bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
-  return Subtarget->hasV6T2Ops();
+  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
 }
 
 bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
-  return Subtarget->hasV6T2Ops();
+  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
 }
 
 bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
@@ -21706,13 +21706,15 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
                                               Value *LaneMask,
                                               ShuffleVectorInst *SVI,
-                                              unsigned Factor) const {
+                                              unsigned Factor,
+                                              const APInt &GapMask) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 0185c8ddd492..ccf6d509313b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -534,8 +534,6 @@ class VectorType;
                                       const APInt &DemandedElts,
                                       TargetLoweringOpt &TLO) const override;
 
-    bool ExpandInlineAsm(CallInst *CI) const override;
-
     ConstraintType getConstraintType(StringRef Constraint) const override;
 
     /// Examine constraint string and operand type and determine a weight value.
@@ -688,8 +686,8 @@ class VectorType;
                               ArrayRef<unsigned> Indices, unsigned Factor,
                               const APInt &GapMask) const override;
     bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                               ShuffleVectorInst *SVI,
-                               unsigned Factor) const override;
+                               ShuffleVectorInst *SVI, unsigned Factor,
+                               const APInt &GapMask) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;
     TargetLoweringBase::AtomicExpansionKind
@@ -709,6 +707,10 @@ class VectorType;
     bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                    unsigned &Cost) const override;
 
+    bool canCreateUndefOrPoisonForTargetNode(
+        SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+        bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override;
+
     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                           const MachineFunction &MF) const override {
       // Do not merge to larger than i32.
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 9eb911406914..e50740f7d57c 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -160,7 +160,7 @@ def CondCodeOperand : AsmOperandClass {
   let DefaultMethod = "defaultCondCodeOp";
   let IsOptional = true;
 }
-def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
                                      (ops (i32 14), (i32 zero_reg))> {
   let PrintMethod = "printPredicateOperand";
   let ParserMatchClass = CondCodeOperand;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 934ec52c6f1e..bdb16d7d3926 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -164,10 +164,9 @@ def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
 def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntUnaryOp>;
 
 def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
-                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+                              [SDNPHasChain, SDNPOutGlue]>;
 def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
-                              [SDNPHasChain, SDNPSideEffect,
-                               SDNPOptInGlue, SDNPOutGlue]>;
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
                                 SDT_ARMStructByVal,
                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
@@ -766,7 +765,6 @@ class MVEVectorIndexOperand<int NumLanes> : AsmOperandClass {
 class MVEVectorIndex<int NumLanes> : Operand<i32> {
   let PrintMethod = "printVectorIndex";
   let ParserMatchClass = MVEVectorIndexOperand<NumLanes>;
-  let MIOperandInfo = (ops i32imm);
 }
 
 // shift_imm: An integer that encodes a shift amount and the type of shift
@@ -1182,7 +1180,6 @@ def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
 def postidx_imm8 : MemOperand {
   let PrintMethod = "printPostIdxImm8Operand";
   let ParserMatchClass = PostIdxImm8AsmOperand;
-  let MIOperandInfo = (ops i32imm);
 }
 
 // postidx_imm8s4 := +/- [0,1020]
@@ -6448,7 +6445,7 @@ def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
                    (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
 
 // Pre-v6, 'mov r0, r0' was used as a NOP encoding.
-def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg), 0>,
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, (cc_out zero_reg)), 0>,
          Requires<[IsARM, NoV6]>;
 
 // MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 7485ef569445..37f0103363b9 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -95,28 +95,24 @@ def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
 }]> {
   let ParserMatchClass = VectorIndex8Operand;
   let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i32imm);
 }
 def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint64_t)Imm) < 4;
 }]> {
   let ParserMatchClass = VectorIndex16Operand;
   let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i32imm);
 }
 def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint64_t)Imm) < 2;
 }]> {
   let ParserMatchClass = VectorIndex32Operand;
   let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i32imm);
 }
 def VectorIndex64 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint64_t)Imm) < 1;
 }]> {
   let ParserMatchClass = VectorIndex64Operand;
   let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i32imm);
 }
 
 // Register list of one D register.
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index e38cafdf55c4..0c5ea3e0fa8d 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1209,8 +1209,9 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255_expr:$imm8), IIC_iMOVi,
 }
 // Because we have an explicit tMOVSr below, we need an alias to handle
 // the immediate "movs" form here. Blech.
-def : tInstAlias <"movs $Rdn, $imm8",
-                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, zero_reg)>;
+def : tInstAlias<"movs $Rdn, $imm8",
+                 (tMOVi8 tGPR:$Rdn, (s_cc_out CPSR),
+                         imm0_255_expr:$imm8, (pred 14, zero_reg))>;
 
 // A7-73: MOV(2) - mov setting flag.
 
@@ -1764,7 +1765,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 
 // In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
 // encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, zero_reg), 0>, Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, (pred 14, zero_reg)), 0>,
+      Requires<[IsThumb, IsThumb1Only]>;
 
 
 // "neg" is and alias for "rsb rd, rn, #0"
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 8f56fb0938dd..c00d616670b5 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -2222,11 +2222,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
   let Inst{7-4} = 0b0000;
 }
 def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
-                                                pred:$p, zero_reg)>;
+                                                pred:$p, (cc_out zero_reg))>;
 def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
-                                                 pred:$p, CPSR)>;
+                                                 pred:$p, (cc_out CPSR))>;
 def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
-                                               pred:$p, CPSR)>;
+                                               pred:$p, (cc_out CPSR))>;
 
 // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
@@ -2244,14 +2244,14 @@ def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
 // cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
 // Use aliases to get that to play nice here.
 def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
-                                                pred:$p, CPSR)>;
+                                                pred:$p, (cc_out CPSR))>;
 def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
-                                                pred:$p, CPSR)>;
+                                                pred:$p, (cc_out CPSR))>;
 
 def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
-                                                 pred:$p, zero_reg)>;
+                                                 pred:$p, (cc_out zero_reg))>;
 def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
-                                               pred:$p, zero_reg)>;
+                                               pred:$p, (cc_out zero_reg))>;
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
 def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
@@ -5122,8 +5122,10 @@ def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 
 // Non-predicable aliases of a predicable DSB: the predicate is (14, zero_reg) where
 // 14 = AL (always execute) and zero_reg = "instruction doesn't read the CPSR".
-def : InstAlias<"ssbb", (t2DSB 0x0, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
-def : InstAlias<"pssbb", (t2DSB 0x4, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"ssbb", (t2DSB 0x0, (pred 14, zero_reg)), 1>,
+      Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"pssbb", (t2DSB 0x4, (pred 14, zero_reg)), 1>,
+      Requires<[HasDB, IsThumb2]>;
 
 // Armv8-R 'Data Full Barrier'
 def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
@@ -5340,7 +5342,8 @@ def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
 // "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
 // for isel.
 def : t2InstSubst<"mov${p} $Rd, $imm",
-                  (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+                  (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p,
+                          (cc_out zero_reg))>;
 def : t2InstSubst<"mvn${s}${p} $Rd, $imm",
                   (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
 // Same for AND <--> BIC
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index fc12f050fa5a..cdff649ecfa5 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -206,7 +206,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) {
 
   getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
 
-  if (ST.hasV5TOps()) {
+  if (ST.hasV5TOps() && !ST.isThumb1Only()) {
     getActionDefinitionsBuilder(G_CTLZ)
         .legalFor({s32, s32})
         .clampScalar(1, s32, s32)
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6b2854171c81..9b250e6cac3a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1916,9 +1916,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
 }
 
 InstructionCost
-ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                   VectorType *ValTy,
+ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                   Type *ResTy, VectorType *ValTy,
                                    TTI::TargetCostKind CostKind) const {
+  if (RedOpcode != Instruction::Add)
+    return InstructionCost::getInvalid(CostKind);
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
 
@@ -1939,7 +1941,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }
 
-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
+                                       CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index cdd8bcb9f741..0810c5532ed9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -299,7 +299,8 @@ public:
                            VectorType *ValTy, std::optional<FastMathFlags> FMF,
                            TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy,
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *ValTy,
                          TTI::TargetCostKind CostKind) const override;
 
   InstructionCost
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index a39629bd8aeb..fa778cad4af8 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM ARMGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM ARMGenDisassemblerTables.inc -gen-disassembler
+              -ignore-non-decodable-operands)
 tablegen(LLVM ARMGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM ARMGenGlobalISel.inc -gen-global-isel)
 tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 19fa03cdc668..1d19bc89ccf9 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -161,676 +161,13 @@ private:
 
 // Forward declare these because the autogenerated code will reference them.
 // Definitions are further down.
-static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeCLRMGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRwithZRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRwithZRnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeTSBInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn,
-                                      uint64_t Adddress,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn,
-                                                 uint64_t Address,
-                                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Val,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVADCInstruction(MCInst &Inst, unsigned Insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeVCVTImmOperand(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeNEONComplexLane64Instruction(MCInst &Inst, unsigned Val, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm7S4(MCInst &Inst, unsigned Val, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm7s4(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address,
-                                 const MCDisassembler *Decoder);
 static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
                                          uint64_t Address,
                                          const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-template <int shift, int WriteBack>
-static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, uint64_t Address,
-                             const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
 
-static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, uint64_t Address,
-                              const MCDisassembler *Decoder);
-static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-template <bool isSigned, bool isNeg, bool zeroPermitted, int size>
-static DecodeStatus DecodeBFLabelOperand(MCInst &Inst, unsigned val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeBFAfterTargetOperand(MCInst &Inst, unsigned val,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-static DecodeStatus DecodePredNoALOperand(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeLOLoop(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeLongShiftOperand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeVpredROperand(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeVpredNOperand(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedIPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedSPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedUPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeRestrictedFPPredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-template <bool Writeback>
-static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-template <int shift>
-static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-template <unsigned MinLog, unsigned MaxLog>
-static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-template <unsigned start>
-static DecodeStatus
-DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVMOVQtoDReg(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVMOVDRegtoQ(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVCVTt1fp(MCInst &Inst, unsigned Insn,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
 typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val,
                                     uint64_t Address,
                                     const MCDisassembler *Decoder);
-template <bool scalar, OperandDecoder predicate_decoder>
-static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus
-DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-#include "ARMGenDisassemblerTables.inc"
-
-static MCDisassembler *createARMDisassembler(const Target &T,
-                                             const MCSubtargetInfo &STI,
-                                             MCContext &Ctx) {
-  return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo());
-}
-
-// Post-decoding checks
-static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
-                                            uint64_t Address, raw_ostream &CS,
-                                            uint32_t Insn,
-                                            DecodeStatus Result) {
-  switch (MI.getOpcode()) {
-    case ARM::HVC: {
-      // HVC is undefined if condition = 0xf otherwise upredictable
-      // if condition != 0xe
-      uint32_t Cond = (Insn >> 28) & 0xF;
-      if (Cond == 0xF)
-        return MCDisassembler::Fail;
-      if (Cond != 0xE)
-        return MCDisassembler::SoftFail;
-      return Result;
-    }
-    case ARM::t2ADDri:
-    case ARM::t2ADDri12:
-    case ARM::t2ADDrr:
-    case ARM::t2ADDrs:
-    case ARM::t2SUBri:
-    case ARM::t2SUBri12:
-    case ARM::t2SUBrr:
-    case ARM::t2SUBrs:
-      if (MI.getOperand(0).getReg() == ARM::SP &&
-          MI.getOperand(1).getReg() != ARM::SP)
-        return MCDisassembler::SoftFail;
-      return Result;
-    default: return Result;
-  }
-}
-
-uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
-                                             uint64_t Address) const {
-  // In Arm state, instructions are always 4 bytes wide, so there's no
-  // point in skipping any smaller number of bytes if an instruction
-  // can't be decoded.
-  if (!STI.hasFeature(ARM::ModeThumb))
-    return 4;
-
-  // In a Thumb instruction stream, a halfword is a standalone 2-byte
-  // instruction if and only if its value is less than 0xE800.
-  // Otherwise, it's the first halfword of a 4-byte instruction.
-  //
-  // So, if we can see the upcoming halfword, we can judge on that
-  // basis, and maybe skip a whole 4-byte instruction that we don't
-  // know how to decode, without accidentally trying to interpret its
-  // second half as something else.
-  //
-  // If we don't have the instruction data available, we just have to
-  // recommend skipping the minimum sensible distance, which is 2
-  // bytes.
-  if (Bytes.size() < 2)
-    return 2;
-
-  uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
-      Bytes.data(), InstructionEndianness);
-  return Insn16 < 0xE800 ? 2 : 4;
-}
-
-DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                             ArrayRef<uint8_t> Bytes,
-                                             uint64_t Address,
-                                             raw_ostream &CS) const {
-  if (STI.hasFeature(ARM::ModeThumb))
-    return getThumbInstruction(MI, Size, Bytes, Address, CS);
-  return getARMInstruction(MI, Size, Bytes, Address, CS);
-}
-
-DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
-                                                ArrayRef<uint8_t> Bytes,
-                                                uint64_t Address,
-                                                raw_ostream &CS) const {
-  CommentStream = &CS;
-
-  assert(!STI.hasFeature(ARM::ModeThumb) &&
-         "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
-         "mode!");
-
-  // We want to read exactly 4 bytes of data.
-  if (Bytes.size() < 4) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  // Encoded as a 32-bit word in the stream.
-  uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(),
-                                                        InstructionEndianness);
-
-  // Calling the auto-generated decoder function.
-  DecodeStatus Result =
-      decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
-  }
-
-  struct DecodeTable {
-    const uint8_t *P;
-    bool DecodePred;
-  };
-
-  const DecodeTable Tables[] = {
-      {DecoderTableVFP32, false},      {DecoderTableVFPV832, false},
-      {DecoderTableNEONData32, true},  {DecoderTableNEONLoadStore32, true},
-      {DecoderTableNEONDup32, true},   {DecoderTablev8NEON32, false},
-      {DecoderTablev8Crypto32, false},
-  };
-
-  for (auto Table : Tables) {
-    Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      // Add a fake predicate operand, because we share these instruction
-      // definitions with Thumb2 where these instructions are predicable.
-      if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
-        return MCDisassembler::Fail;
-      return Result;
-    }
-  }
-
-  Result =
-      decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
-  }
-
-  Size = 4;
-  return MCDisassembler::Fail;
-}
 
 /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
 /// immediate Value in the MCInst.  The immediate Value has had any PC
@@ -868,409 +205,7 @@ static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
   Decoder->tryAddingPcLoadReferenceComment(Value, Address);
 }
 
-// Thumb1 instructions don't have explicit S bits.  Rather, they
-// implicitly set CPSR.  Since it's not represented in the encoding, the
-// auto-generated decoder won't inject the CPSR operand.  We need to fix
-// that as a post-pass.
-void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
-  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-  MCInst::iterator I = MI.begin();
-  for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) {
-    if (I == MI.end()) break;
-    if (MCID.operands()[i].isOptionalDef() &&
-        MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
-      if (i > 0 && MCID.operands()[i - 1].isPredicate())
-        continue;
-      MI.insert(I,
-                MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
-      return;
-    }
-  }
-
-  MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
-}
-
-bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {
-  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-  for (unsigned i = 0; i < MCID.NumOperands; ++i) {
-    if (ARM::isVpred(MCID.operands()[i].OperandType))
-      return true;
-  }
-  return false;
-}
-
-// Most Thumb instructions don't have explicit predicates in the
-// encoding, but rather get their predicates from IT context.  We need
-// to fix up the predicate operands using this context information as a
-// post-pass.
-MCDisassembler::DecodeStatus
-ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
-  MCDisassembler::DecodeStatus S = Success;
-
-  const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
-
-  // A few instructions actually have predicates encoded in them.  Don't
-  // try to overwrite it if we're seeing one of those.
-  switch (MI.getOpcode()) {
-    case ARM::tBcc:
-    case ARM::t2Bcc:
-    case ARM::tCBZ:
-    case ARM::tCBNZ:
-    case ARM::tCPS:
-    case ARM::t2CPS3p:
-    case ARM::t2CPS2p:
-    case ARM::t2CPS1p:
-    case ARM::t2CSEL:
-    case ARM::t2CSINC:
-    case ARM::t2CSINV:
-    case ARM::t2CSNEG:
-    case ARM::tMOVSr:
-    case ARM::tSETEND:
-      // Some instructions (mostly conditional branches) are not
-      // allowed in IT blocks.
-      if (ITBlock.instrInITBlock())
-        S = SoftFail;
-      else
-        return Success;
-      break;
-    case ARM::t2HINT:
-      if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
-        S = SoftFail;
-      break;
-    case ARM::tB:
-    case ARM::t2B:
-    case ARM::t2TBB:
-    case ARM::t2TBH:
-      // Some instructions (mostly unconditional branches) can
-      // only appears at the end of, or outside of, an IT.
-      if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
-        S = SoftFail;
-      break;
-    default:
-      break;
-  }
-
-  // Warn on non-VPT predicable instruction in a VPT block and a VPT
-  // predicable instruction in an IT block
-  if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) ||
-      (isVectorPredicable(MI) && ITBlock.instrInITBlock()))
-    S = SoftFail;
-
-  // If we're in an IT/VPT block, base the predicate on that.  Otherwise,
-  // assume a predicate of AL.
-  unsigned CC = ARMCC::AL;
-  unsigned VCC = ARMVCC::None;
-  if (ITBlock.instrInITBlock()) {
-    CC = ITBlock.getITCC();
-    ITBlock.advanceITState();
-  } else if (VPTBlock.instrInVPTBlock()) {
-    VCC = VPTBlock.getVPTPred();
-    VPTBlock.advanceVPTState();
-  }
-
-  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-
-  MCInst::iterator CCI = MI.begin();
-  for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) {
-    if (MCID.operands()[i].isPredicate() || CCI == MI.end())
-      break;
-  }
-
-  if (MCID.isPredicable()) {
-    CCI = MI.insert(CCI, MCOperand::createImm(CC));
-    ++CCI;
-    if (CC == ARMCC::AL)
-      MI.insert(CCI, MCOperand::createReg(ARM::NoRegister));
-    else
-      MI.insert(CCI, MCOperand::createReg(ARM::CPSR));
-  } else if (CC != ARMCC::AL) {
-    Check(S, SoftFail);
-  }
-
-  MCInst::iterator VCCI = MI.begin();
-  unsigned VCCPos;
-  for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) {
-    if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end())
-      break;
-  }
-
-  if (isVectorPredicable(MI)) {
-    VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
-    ++VCCI;
-    if (VCC == ARMVCC::None)
-      VCCI = MI.insert(VCCI, MCOperand::createReg(0));
-    else
-      VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
-    ++VCCI;
-    VCCI = MI.insert(VCCI, MCOperand::createReg(0));
-    ++VCCI;
-    if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
-      int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO);
-      assert(TiedOp >= 0 &&
-             "Inactive register in vpred_r is not tied to an output!");
-      // Copy the operand to ensure it's not invalidated when MI grows.
-      MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
-    }
-  } else if (VCC != ARMVCC::None) {
-    Check(S, SoftFail);
-  }
-
-  return S;
-}
-
-// Thumb VFP instructions are a special case.  Because we share their
-// encodings between ARM and Thumb modes, and they are predicable in ARM
-// mode, the auto-generated decoder will give them an (incorrect)
-// predicate operand.  We need to rewrite these operands based on the IT
-// context as a post-pass.
-void ARMDisassembler::UpdateThumbVFPPredicate(
-  DecodeStatus &S, MCInst &MI) const {
-  unsigned CC;
-  CC = ITBlock.getITCC();
-  if (CC == 0xF)
-    CC = ARMCC::AL;
-  if (ITBlock.instrInITBlock())
-    ITBlock.advanceITState();
-  else if (VPTBlock.instrInVPTBlock()) {
-    CC = VPTBlock.getVPTPred();
-    VPTBlock.advanceVPTState();
-  }
-
-  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-  ArrayRef<MCOperandInfo> OpInfo = MCID.operands();
-  MCInst::iterator I = MI.begin();
-  unsigned short NumOps = MCID.NumOperands;
-  for (unsigned i = 0; i < NumOps; ++i, ++I) {
-    if (OpInfo[i].isPredicate() ) {
-      if (CC != ARMCC::AL && !MCID.isPredicable())
-        Check(S, SoftFail);
-      I->setImm(CC);
-      ++I;
-      if (CC == ARMCC::AL)
-        I->setReg(ARM::NoRegister);
-      else
-        I->setReg(ARM::CPSR);
-      return;
-    }
-  }
-}
-
-DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
-                                                  ArrayRef<uint8_t> Bytes,
-                                                  uint64_t Address,
-                                                  raw_ostream &CS) const {
-  CommentStream = &CS;
-
-  assert(STI.hasFeature(ARM::ModeThumb) &&
-         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
-
-  // We want to read exactly 2 bytes of data.
-  if (Bytes.size() < 2) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
-      Bytes.data(), InstructionEndianness);
-  DecodeStatus Result =
-      decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 2;
-    Check(Result, AddThumbPredicate(MI));
-    return Result;
-  }
-
-  Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
-                             STI);
-  if (Result) {
-    Size = 2;
-    bool InITBlock = ITBlock.instrInITBlock();
-    Check(Result, AddThumbPredicate(MI));
-    AddThumb1SBit(MI, InITBlock);
-    return Result;
-  }
-
-  Result =
-      decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 2;
-
-    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
-    // the Thumb predicate.
-    if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
-      Result = MCDisassembler::SoftFail;
-
-    Check(Result, AddThumbPredicate(MI));
-
-    // If we find an IT instruction, we need to parse its condition
-    // code and mask operands so that we can apply them correctly
-    // to the subsequent instructions.
-    if (MI.getOpcode() == ARM::t2IT) {
-      unsigned Firstcond = MI.getOperand(0).getImm();
-      unsigned Mask = MI.getOperand(1).getImm();
-      ITBlock.setITState(Firstcond, Mask);
-
-      // An IT instruction that would give a 'NV' predicate is unpredictable.
-      if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
-        CS << "unpredictable IT predicate sequence";
-    }
-
-    return Result;
-  }
-
-  // We want to read exactly 4 bytes of data.
-  if (Bytes.size() < 4) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  uint32_t Insn32 =
-      (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>(
-                                     Bytes.data() + 2, InstructionEndianness);
-
-  Result =
-      decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-
-    // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add
-    // the VPT predicate.
-    if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock())
-      Result = MCDisassembler::SoftFail;
-
-    Check(Result, AddThumbPredicate(MI));
-
-    if (isVPTOpcode(MI.getOpcode())) {
-      unsigned Mask = MI.getOperand(0).getImm();
-      VPTBlock.setVPTState(Mask);
-    }
-
-    return Result;
-  }
-
-  Result =
-      decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    bool InITBlock = ITBlock.instrInITBlock();
-    Check(Result, AddThumbPredicate(MI));
-    AddThumb1SBit(MI, InITBlock);
-    return Result;
-  }
-
-  Result =
-      decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    Check(Result, AddThumbPredicate(MI));
-    return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result);
-  }
-
-  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
-    Result =
-        decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      UpdateThumbVFPPredicate(Result, MI);
-      return Result;
-    }
-  }
-
-  Result =
-      decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
-  }
-
-  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
-    Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
-                               STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      Check(Result, AddThumbPredicate(MI));
-      return Result;
-    }
-  }
-
-  if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
-    uint32_t NEONLdStInsn = Insn32;
-    NEONLdStInsn &= 0xF0FFFFFF;
-    NEONLdStInsn |= 0x04000000;
-    Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      Check(Result, AddThumbPredicate(MI));
-      return Result;
-    }
-  }
-
-  if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
-    uint32_t NEONDataInsn = Insn32;
-    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
-    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
-    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
-    Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      Check(Result, AddThumbPredicate(MI));
-      return Result;
-    }
-
-    uint32_t NEONCryptoInsn = Insn32;
-    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
-    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
-    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
-    Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      return Result;
-    }
-
-    uint32_t NEONv8Insn = Insn32;
-    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
-    Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
-                               this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      return Result;
-    }
-  }
-
-  uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
-  const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
-                                    ? DecoderTableThumb2CDE32
-                                    : DecoderTableThumb2CoProc32;
-  Result =
-      decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    Check(Result, AddThumbPredicate(MI));
-    return Result;
-  }
-
-  // Advance IT state to prevent next instruction inheriting
-  // the wrong IT state.
-  if (ITBlock.instrInITBlock())
-    ITBlock.advanceITState();
-  Size = 0;
-  return MCDisassembler::Fail;
-}
-
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeARMDisassembler() {
-  TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
-                                         createARMDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
-                                         createARMDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
-                                         createARMDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
-                                         createARMDisassembler);
-}
+// Register class decoding functions.
 
 static const uint16_t GPRDecoderTable[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3,
@@ -1626,6 +561,51 @@ DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  if (RegNo > 7) // Only RegNo 0-7 is accepted for this register class.
+    return MCDisassembler::Fail;
+
+  unsigned Register = QPRDecoderTable[RegNo]; // Looked up in the QPR table.
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const MCPhysReg QQPRDecoderTable[] = {
+     ARM::Q0_Q1,  ARM::Q1_Q2,  ARM::Q2_Q3,  ARM::Q3_Q4,
+     ARM::Q4_Q5,  ARM::Q5_Q6,  ARM::Q6_Q7
+};
+
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const MCDisassembler *Decoder) {
+  if (RegNo > 6) // QQPRDecoderTable has seven entries (indices 0-6).
+    return MCDisassembler::Fail;
+
+  unsigned Register = QQPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const MCPhysReg QQQQPRDecoderTable[] = {
+     ARM::Q0_Q1_Q2_Q3,  ARM::Q1_Q2_Q3_Q4,  ARM::Q2_Q3_Q4_Q5,
+     ARM::Q3_Q4_Q5_Q6,  ARM::Q4_Q5_Q6_Q7
+};
+
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 4) // QQQQPRDecoderTable has five entries (indices 0-4).
+    return MCDisassembler::Fail;
+
+  unsigned Register = QQQQPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+// Operand decoding functions.
+
 static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
                                            uint64_t Address,
                                            const MCDisassembler *Decoder) {
@@ -2422,6 +1402,54 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const MCDisassembler *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 18, 2);
+  unsigned M = fieldFromInstruction(Insn, 17, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 6, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // This decoder is called from multiple locations that do not check
+  // that the full encoding is valid before calling it.
+  if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+      fieldFromInstruction(Insn, 16, 1) != 0 ||
+      fieldFromInstruction(Insn, 20, 8) != 0x10)
+    return MCDisassembler::Fail;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) { // Three operands: imod, iflags and mode.
+    Inst.setOpcode(ARM::CPS3p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    Inst.addOperand(MCOperand::createImm(mode));
+  } else if (imod && !M) { // Two operands; a nonzero mode is a SoftFail.
+    Inst.setOpcode(ARM::CPS2p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) { // Mode only; nonzero iflags is a SoftFail.
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::createImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::createImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
 static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
                                           uint64_t Address,
                                           const MCDisassembler *Decoder) {
@@ -2562,54 +1590,6 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder) {
-  unsigned imod = fieldFromInstruction(Insn, 18, 2);
-  unsigned M = fieldFromInstruction(Insn, 17, 1);
-  unsigned iflags = fieldFromInstruction(Insn, 6, 3);
-  unsigned mode = fieldFromInstruction(Insn, 0, 5);
-
-  DecodeStatus S = MCDisassembler::Success;
-
-  // This decoder is called from multiple location that do not check
-  // the full encoding is valid before they do.
-  if (fieldFromInstruction(Insn, 5, 1) != 0 ||
-      fieldFromInstruction(Insn, 16, 1) != 0 ||
-      fieldFromInstruction(Insn, 20, 8) != 0x10)
-    return MCDisassembler::Fail;
-
-  // imod == '01' --> UNPREDICTABLE
-  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
-  // return failure here.  The '01' imod value is unprintable, so there's
-  // nothing useful we could do even if we returned UNPREDICTABLE.
-
-  if (imod == 1) return MCDisassembler::Fail;
-
-  if (imod && M) {
-    Inst.setOpcode(ARM::CPS3p);
-    Inst.addOperand(MCOperand::createImm(imod));
-    Inst.addOperand(MCOperand::createImm(iflags));
-    Inst.addOperand(MCOperand::createImm(mode));
-  } else if (imod && !M) {
-    Inst.setOpcode(ARM::CPS2p);
-    Inst.addOperand(MCOperand::createImm(imod));
-    Inst.addOperand(MCOperand::createImm(iflags));
-    if (mode) S = MCDisassembler::SoftFail;
-  } else if (!imod && M) {
-    Inst.setOpcode(ARM::CPS1p);
-    Inst.addOperand(MCOperand::createImm(mode));
-    if (iflags) S = MCDisassembler::SoftFail;
-  } else {
-    // imod == '00' && M == '0' --> UNPREDICTABLE
-    Inst.setOpcode(ARM::CPS1p);
-    Inst.addOperand(MCOperand::createImm(mode));
-    S = MCDisassembler::SoftFail;
-  }
-
-  return S;
-}
-
 static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
                                            uint64_t Address,
                                            const MCDisassembler *Decoder) {
@@ -2760,28 +1740,6 @@ static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder) {
-  DecodeStatus S = MCDisassembler::Success;
-
-  unsigned Pred = fieldFromInstruction(Insn, 28, 4);
-  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
-  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
-
-  if (Pred == 0xF)
-    return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
-
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
-    return MCDisassembler::Fail;
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
-    return MCDisassembler::Fail;
-  if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder)))
-    return MCDisassembler::Fail;
-
-  return S;
-}
-
 static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
                                             uint64_t Address,
                                             const MCDisassembler *Decoder) {
@@ -2811,6 +1769,28 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Pred == 0xF) // A 0xF predicate field is handed to the SETPAN decoder.
+    return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
 static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
                                                uint64_t Address,
                                                const MCDisassembler *Decoder) {
@@ -3232,61 +2212,6 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  unsigned type = fieldFromInstruction(Insn, 8, 4);
-  unsigned align = fieldFromInstruction(Insn, 4, 2);
-  if (type == 6 && (align & 2)) return MCDisassembler::Fail;
-  if (type == 7 && (align & 2)) return MCDisassembler::Fail;
-  if (type == 10 && align == 3) return MCDisassembler::Fail;
-
-  unsigned load = fieldFromInstruction(Insn, 21, 1);
-  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
-              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  unsigned size = fieldFromInstruction(Insn, 6, 2);
-  if (size == 3) return MCDisassembler::Fail;
-
-  unsigned type = fieldFromInstruction(Insn, 8, 4);
-  unsigned align = fieldFromInstruction(Insn, 4, 2);
-  if (type == 8 && align == 3) return MCDisassembler::Fail;
-  if (type == 9 && align == 3) return MCDisassembler::Fail;
-
-  unsigned load = fieldFromInstruction(Insn, 21, 1);
-  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
-              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  unsigned size = fieldFromInstruction(Insn, 6, 2);
-  if (size == 3) return MCDisassembler::Fail;
-
-  unsigned align = fieldFromInstruction(Insn, 4, 2);
-  if (align & 2) return MCDisassembler::Fail;
-
-  unsigned load = fieldFromInstruction(Insn, 21, 1);
-  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
-              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
-static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  unsigned size = fieldFromInstruction(Insn, 6, 2);
-  if (size == 3) return MCDisassembler::Fail;
-
-  unsigned load = fieldFromInstruction(Insn, 21, 1);
-  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
-              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
-}
-
 static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
                                          uint64_t Address,
                                          const MCDisassembler *Decoder) {
@@ -3558,6 +2483,61 @@ static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1); // Bit 21: VLD vs. VST.
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 8 && align == 3) return MCDisassembler::Fail;
+  if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1); // Bit 21: VLD vs. VST.
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (align & 2) return MCDisassembler::Fail; // align values 2 and 3 fail.
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1); // Bit 21: VLD vs. VST.
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail; // size == 3 is rejected.
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1); // Bit 21: VLD vs. VST.
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
 static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
                                              uint64_t Address,
                                              const MCDisassembler *Decoder) {
@@ -4063,6 +3043,60 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
   return S;
 }
 
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                      uint64_t Address,
+                                      const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  int imm = fieldFromInstruction(Insn, 0, 12);
+
+  const FeatureBitset &featureBits =
+      Decoder->getSubtargetInfo().getFeatureBits();
+
+  bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+  if (Rt == 15) { // With Rt == 15 some loads are rewritten to PLD/PLI.
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBpci:
+      case ARM::t2LDRHpci:
+        Inst.setOpcode(ARM::t2PLDpci);
+        break;
+      case ARM::t2LDRSBpci:
+        Inst.setOpcode(ARM::t2PLIpci);
+        break;
+      case ARM::t2LDRSHpci:
+        return MCDisassembler::Fail;
+      default:
+        break;
+    }
+  }
+
+  switch(Inst.getOpcode()) {
+  case ARM::t2PLDpci: // No Rt operand is added for the preload opcodes.
+    break;
+  case ARM::t2PLIpci:
+    if (!hasV7Ops) // t2PLIpci is only accepted when HasV7Ops is set.
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!U) {
+    // Special case for #-0: it is represented as INT32_MIN.
+    if (imm == 0)
+      imm = INT32_MIN;
+    else
+      imm = -imm;
+  }
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
 static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
                                       uint64_t Address,
                                       const MCDisassembler *Decoder) {
@@ -4232,6 +3266,33 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4); // 4-bit field at bit 13.
+  unsigned imm = fieldFromInstruction(Val, 0, 12); // Low 12 bits of Val.
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    if (Rn == 15) // Rn == 15 is rejected for the store opcodes listed here.
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
 static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
                                       uint64_t Address,
                                       const MCDisassembler *Decoder) {
@@ -4352,60 +3413,6 @@ static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, uint64_t Address,
   return S;
 }
 
-static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder) {
-  DecodeStatus S = MCDisassembler::Success;
-
-  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
-  unsigned U = fieldFromInstruction(Insn, 23, 1);
-  int imm = fieldFromInstruction(Insn, 0, 12);
-
-  const FeatureBitset &featureBits =
-      Decoder->getSubtargetInfo().getFeatureBits();
-
-  bool hasV7Ops = featureBits[ARM::HasV7Ops];
-
-  if (Rt == 15) {
-    switch (Inst.getOpcode()) {
-      case ARM::t2LDRBpci:
-      case ARM::t2LDRHpci:
-        Inst.setOpcode(ARM::t2PLDpci);
-        break;
-      case ARM::t2LDRSBpci:
-        Inst.setOpcode(ARM::t2PLIpci);
-        break;
-      case ARM::t2LDRSHpci:
-        return MCDisassembler::Fail;
-      default:
-        break;
-    }
-  }
-
-  switch(Inst.getOpcode()) {
-  case ARM::t2PLDpci:
-    break;
-  case ARM::t2PLIpci:
-    if (!hasV7Ops)
-      return MCDisassembler::Fail;
-    break;
-  default:
-    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
-      return MCDisassembler::Fail;
-  }
-
-  if (!U) {
-    // Special case for #-0.
-    if (imm == 0)
-      imm = INT32_MIN;
-    else
-      imm = -imm;
-  }
-  Inst.addOperand(MCOperand::createImm(imm));
-
-  return S;
-}
-
 static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address,
                                    const MCDisassembler *Decoder) {
   if (Val == 0)
@@ -4655,33 +3662,6 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder) {
-  DecodeStatus S = MCDisassembler::Success;
-
-  unsigned Rn = fieldFromInstruction(Val, 13, 4);
-  unsigned imm = fieldFromInstruction(Val, 0, 12);
-
-  // Thumb stores cannot use PC as dest register.
-  switch (Inst.getOpcode()) {
-  case ARM::t2STRi12:
-  case ARM::t2STRBi12:
-  case ARM::t2STRHi12:
-    if (Rn == 15)
-      return MCDisassembler::Fail;
-    break;
-  default:
-    break;
-  }
-
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
-    return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createImm(imm));
-
-  return S;
-}
-
 static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
@@ -4844,6 +3824,16 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  if (Val & ~0xf) // Any bit set above the low four makes the option invalid.
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
                                                uint64_t Address,
                                                const MCDisassembler *Decoder) {
@@ -4951,16 +3941,6 @@ static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder) {
-  if (Val & ~0xf)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::createImm(Val));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
                                                 uint64_t Address,
                                                 const MCDisassembler *Decoder) {
@@ -6475,49 +5455,6 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
   return S;
 }
 
-static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  if (RegNo > 7)
-    return MCDisassembler::Fail;
-
-  unsigned Register = QPRDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::createReg(Register));
-  return MCDisassembler::Success;
-}
-
-static const MCPhysReg QQPRDecoderTable[] = {
-     ARM::Q0_Q1,  ARM::Q1_Q2,  ARM::Q2_Q3,  ARM::Q3_Q4,
-     ARM::Q4_Q5,  ARM::Q5_Q6,  ARM::Q6_Q7
-};
-
-static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder) {
-  if (RegNo > 6)
-    return MCDisassembler::Fail;
-
-  unsigned Register = QQPRDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::createReg(Register));
-  return MCDisassembler::Success;
-}
-
-static const MCPhysReg QQQQPRDecoderTable[] = {
-     ARM::Q0_Q1_Q2_Q3,  ARM::Q1_Q2_Q3_Q4,  ARM::Q2_Q3_Q4_Q5,
-     ARM::Q3_Q4_Q5_Q6,  ARM::Q4_Q5_Q6_Q7
-};
-
-static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 4)
-    return MCDisassembler::Fail;
-
-  unsigned Register = QQQQPRDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::createReg(Register));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeVPTMaskOperand(MCInst &Inst, unsigned Val,
                                          uint64_t Address,
                                          const MCDisassembler *Decoder) {
@@ -7069,3 +6006,547 @@ static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
 
   return S;
 }
+
+#include "ARMGenDisassemblerTables.inc"
+
+// Post-decoding checks
+static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
+                                            uint64_t Address, raw_ostream &CS,
+                                            uint32_t Insn,
+                                            DecodeStatus Result) {
+  switch (MI.getOpcode()) {
+    case ARM::HVC: {
+      // HVC is undefined if condition = 0xf, otherwise unpredictable
+      // if condition != 0xe
+      uint32_t Cond = (Insn >> 28) & 0xF;
+      if (Cond == 0xF)
+        return MCDisassembler::Fail;
+      if (Cond != 0xE)
+        return MCDisassembler::SoftFail;
+      return Result;
+    }
+    case ARM::t2ADDri:
+    case ARM::t2ADDri12:
+    case ARM::t2ADDrr:
+    case ARM::t2ADDrs:
+    case ARM::t2SUBri:
+    case ARM::t2SUBri12:
+    case ARM::t2SUBrr:
+    case ARM::t2SUBrs:
+      if (MI.getOperand(0).getReg() == ARM::SP && // SP as operand 0 ...
+          MI.getOperand(1).getReg() != ARM::SP)   // ... but not as operand 1.
+        return MCDisassembler::SoftFail;
+      return Result;
+    default: return Result; // All other opcodes keep the decoder's status.
+  }
+}
+
+uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address) const {
+  // In Arm state, instructions are always 4 bytes wide, so there's no
+  // point in skipping any smaller number of bytes if an instruction
+  // can't be decoded.
+  if (!STI.hasFeature(ARM::ModeThumb))
+    return 4;
+
+  // In a Thumb instruction stream, a halfword is a standalone 2-byte
+  // instruction if and only if its value is less than 0xE800.
+  // Otherwise, it's the first halfword of a 4-byte instruction.
+  //
+  // So, if we can see the upcoming halfword, we can judge on that
+  // basis, and maybe skip a whole 4-byte instruction that we don't
+  // know how to decode, without accidentally trying to interpret its
+  // second half as something else.
+  //
+  // If we don't have the instruction data available, we just have to
+  // recommend skipping the minimum sensible distance, which is 2
+  // bytes.
+  if (Bytes.size() < 2)
+    return 2;
+
+  uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
+      Bytes.data(), InstructionEndianness);
+  return Insn16 < 0xE800 ? 2 : 4; // 2: standalone halfword; 4: 32-bit form.
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address,
+                                             raw_ostream &CS) const {
+  if (STI.hasFeature(ARM::ModeThumb)) // Dispatch on the ModeThumb feature.
+    return getThumbInstruction(MI, Size, Bytes, Address, CS);
+  return getARMInstruction(MI, Size, Bytes, Address, CS);
+}
+
+DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
+                                                ArrayRef<uint8_t> Bytes,
+                                                uint64_t Address,
+                                                raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  assert(!STI.hasFeature(ARM::ModeThumb) &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
+         "mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a 32-bit word in the stream.
+  uint32_t Insn = llvm::support::endian::read<uint32_t>(Bytes.data(),
+                                                        InstructionEndianness);
+
+  // Try the main ARM table with the auto-generated decoder first.
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
+  }
+
+  struct DecodeTable {
+    const uint8_t *P; // Decoder table to try.
+    bool DecodePred;  // If set, append a fake 0xE predicate operand.
+  };
+
+  const DecodeTable Tables[] = {
+      {DecoderTableVFP32, false},      {DecoderTableVFPV832, false},
+      {DecoderTableNEONData32, true},  {DecoderTableNEONLoadStore32, true},
+      {DecoderTableNEONDup32, true},   {DecoderTablev8NEON32, false},
+      {DecoderTablev8Crypto32, false},
+  };
+
+  for (auto Table : Tables) { // Tables are tried in order; first match wins.
+    Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      // Add a fake predicate operand, because we share these instruction
+      // definitions with Thumb2 where these instructions are predicable.
+      if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
+        return MCDisassembler::Fail;
+      return Result;
+    }
+  }
+
+  Result =
+      decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
+  }
+
+  Size = 4; // Even on failure, a full 4-byte instruction is reported.
+  return MCDisassembler::Fail;
+}
+
+// Thumb1 instructions don't have explicit S bits.  Rather, they
+// implicitly set CPSR.  Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand.  We need to fix
+// that as a post-pass.
+void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const {
+  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) {
+    if (I == MI.end()) break; // Ran out of decoded operands.
+    if (MCID.operands()[i].isOptionalDef() &&
+        MCID.operands()[i].RegClass == ARM::CCRRegClassID) {
+      if (i > 0 && MCID.operands()[i - 1].isPredicate())
+        continue; // Skip an optional def that directly follows a predicate.
+      MI.insert(I,
+                MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
+      return;
+    }
+  }
+
+  MI.insert(I, MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR));
+}
+
+bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const {
+  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+  for (unsigned i = 0; i < MCID.NumOperands; ++i) {
+    if (ARM::isVpred(MCID.operands()[i].OperandType)) // Any vpred operand.
+      return true;
+  }
+  return false; // No operand of the descriptor is a vpred operand.
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context.  We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
+  // A few instructions actually have predicates encoded in them.  Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+    case ARM::tBcc:
+    case ARM::t2Bcc:
+    case ARM::tCBZ:
+    case ARM::tCBNZ:
+    case ARM::tCPS:
+    case ARM::t2CPS3p:
+    case ARM::t2CPS2p:
+    case ARM::t2CPS1p:
+    case ARM::t2CSEL:
+    case ARM::t2CSINC:
+    case ARM::t2CSINV:
+    case ARM::t2CSNEG:
+    case ARM::tMOVSr:
+    case ARM::tSETEND:
+      // Some instructions (mostly conditional branches) are not
+      // allowed in IT blocks.
+      if (ITBlock.instrInITBlock())
+        S = SoftFail;
+      else
+        return Success;
+      break;
+    case ARM::t2HINT:
+      if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
+        S = SoftFail;
+      break;
+    case ARM::tB:
+    case ARM::t2B:
+    case ARM::t2TBB:
+    case ARM::t2TBH:
+      // Some instructions (mostly unconditional branches) can
+      // only appear at the end of, or outside of, an IT block.
+      if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+        S = SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  // Warn on a non-VPT-predicable instruction in a VPT block and on a
+  // VPT-predicable instruction in an IT block.
+  if ((!isVectorPredicable(MI) && VPTBlock.instrInVPTBlock()) ||
+      (isVectorPredicable(MI) && ITBlock.instrInITBlock()))
+    S = SoftFail;
+
+  // If we're in an IT/VPT block, base the predicate on that.  Otherwise,
+  // assume a predicate of AL.
+  unsigned CC = ARMCC::AL;
+  unsigned VCC = ARMVCC::None;
+  if (ITBlock.instrInITBlock()) {
+    CC = ITBlock.getITCC();
+    ITBlock.advanceITState();
+  } else if (VPTBlock.instrInVPTBlock()) {
+    VCC = VPTBlock.getVPTPred();
+    VPTBlock.advanceVPTState();
+  }
+
+  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+
+  MCInst::iterator CCI = MI.begin(); // Position for the predicate operands.
+  for (unsigned i = 0; i < MCID.NumOperands; ++i, ++CCI) {
+    if (MCID.operands()[i].isPredicate() || CCI == MI.end())
+      break;
+  }
+
+  if (MCID.isPredicable()) {
+    CCI = MI.insert(CCI, MCOperand::createImm(CC));
+    ++CCI;
+    if (CC == ARMCC::AL)
+      MI.insert(CCI, MCOperand::createReg(ARM::NoRegister));
+    else
+      MI.insert(CCI, MCOperand::createReg(ARM::CPSR));
+  } else if (CC != ARMCC::AL) {
+    Check(S, SoftFail); // Non-predicable opcode with a non-AL condition.
+  }
+
+  MCInst::iterator VCCI = MI.begin(); // Position for the vpred operands.
+  unsigned VCCPos;
+  for (VCCPos = 0; VCCPos < MCID.NumOperands; ++VCCPos, ++VCCI) {
+    if (ARM::isVpred(MCID.operands()[VCCPos].OperandType) || VCCI == MI.end())
+      break;
+  }
+
+  if (isVectorPredicable(MI)) {
+    VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
+    ++VCCI;
+    if (VCC == ARMVCC::None)
+      VCCI = MI.insert(VCCI, MCOperand::createReg(0));
+    else
+      VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
+    ++VCCI;
+    VCCI = MI.insert(VCCI, MCOperand::createReg(0));
+    ++VCCI;
+    if (MCID.operands()[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
+      int TiedOp = MCID.getOperandConstraint(VCCPos + 3, MCOI::TIED_TO);
+      assert(TiedOp >= 0 &&
+             "Inactive register in vpred_r is not tied to an output!");
+      // Copy the operand to ensure it's not invalidated when MI grows.
+      MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp)));
+    }
+  } else if (VCC != ARMVCC::None) {
+    Check(S, SoftFail); // Not vector-predicable, yet a VPT predicate is set.
+  }
+
+  return S;
+}
+
+// Thumb VFP instructions are a special case.  Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand.  We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ARMDisassembler::UpdateThumbVFPPredicate(
+  DecodeStatus &S, MCInst &MI) const {
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (CC == 0xF) // A raw 0xF condition value is treated as "always".
+    CC = ARMCC::AL;
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState(); // Step the IT state past this instruction.
+  else if (VPTBlock.instrInVPTBlock()) {
+    CC = VPTBlock.getVPTPred(); // NOTE(review): VPT predicate reuses CC; confirm.
+    VPTBlock.advanceVPTState();
+  }
+
+  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
+  ArrayRef<MCOperandInfo> OpInfo = MCID.operands();
+  MCInst::iterator I = MI.begin();
+  unsigned short NumOps = MCID.NumOperands;
+  for (unsigned i = 0; i < NumOps; ++i, ++I) { // I advances in step with i.
+    if (OpInfo[i].isPredicate() ) {
+      if (CC != ARMCC::AL && !MCID.isPredicable())
+        Check(S, SoftFail);
+      I->setImm(CC); // Overwrite the decoder-supplied condition immediate.
+      ++I; // The following operand is rewritten as a register.
+      if (CC == ARMCC::AL)
+        I->setReg(ARM::NoRegister);
+      else
+        I->setReg(ARM::CPSR);
+      return; // Only the first predicate operand is rewritten.
+    }
+  }
+}
+
+DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &CS) const {
+  CommentStream = &CS;
+  // Decode one Thumb instruction; Size is set to 2 or 4 here, or 0 on Fail.
+  assert(STI.hasFeature(ARM::ModeThumb) &&
+         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+  // We want to read exactly 2 bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint16_t Insn16 = llvm::support::endian::read<uint16_t>(
+      Bytes.data(), InstructionEndianness);
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 2;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
+  }
+
+  Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
+                             STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 2;
+    bool InITBlock = ITBlock.instrInITBlock(); // Sampled before AddThumbPredicate.
+    Check(Result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 2;
+
+    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
+    // the Thumb predicate.
+    if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+      Result = MCDisassembler::SoftFail;
+
+    Check(Result, AddThumbPredicate(MI));
+
+    // If we find an IT instruction, we need to parse its condition
+    // code and mask operands so that we can apply them correctly
+    // to the subsequent instructions.
+    if (MI.getOpcode() == ARM::t2IT) {
+      unsigned Firstcond = MI.getOperand(0).getImm();
+      unsigned Mask = MI.getOperand(1).getImm();
+      ITBlock.setITState(Firstcond, Mask);
+
+      // An IT instruction that would give a 'NV' predicate is unpredictable.
+      if (Firstcond == ARMCC::AL && !isPowerOf2_32(Mask))
+        CS << "unpredictable IT predicate sequence";
+    }
+
+    return Result;
+  }
+
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint32_t Insn32 = // The first halfword forms the upper 16 bits.
+      (uint32_t(Insn16) << 16) | llvm::support::endian::read<uint16_t>(
+                                     Bytes.data() + 2, InstructionEndianness);
+
+  Result =
+      decodeInstruction(DecoderTableMVE32, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+
+    // Nested VPT blocks are UNPREDICTABLE. Must be checked before we add
+    // the VPT predicate.
+    if (isVPTOpcode(MI.getOpcode()) && VPTBlock.instrInVPTBlock())
+      Result = MCDisassembler::SoftFail;
+
+    Check(Result, AddThumbPredicate(MI));
+
+    if (isVPTOpcode(MI.getOpcode())) { // Start tracking the new VPT block.
+      unsigned Mask = MI.getOperand(0).getImm();
+      VPTBlock.setVPTState(Mask);
+    }
+
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    bool InITBlock = ITBlock.instrInITBlock(); // Sampled before AddThumbPredicate.
+    Check(Result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(Result, AddThumbPredicate(MI));
+    return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result);
+  }
+
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+    Result =
+        decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      UpdateThumbVFPPredicate(Result, MI); // Rewrites the decoded predicate.
+      return Result;
+    }
+  }
+
+  Result =
+      decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result; // No Thumb predicate is added for this table.
+  }
+
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+    Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+  }
+
+  if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
+    uint32_t NEONLdStInsn = Insn32;
+    NEONLdStInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONLdStInsn |= 0x04000000; // Set bit 26
+    Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+  }
+
+  if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
+    uint32_t NEONDataInsn = Insn32;
+    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+    Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+
+    uint32_t NEONCryptoInsn = Insn32;
+    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+    Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result; // No Thumb predicate is added for this table.
+    }
+
+    uint32_t NEONv8Insn = Insn32;
+    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+    Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result; // No Thumb predicate is added for this table.
+    }
+  }
+
+  uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
+  const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
+                                    ? DecoderTableThumb2CDE32
+                                    : DecoderTableThumb2CoProc32;
+  Result =
+      decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
+  }
+
+  // Advance IT state to prevent next instruction inheriting
+  // the wrong IT state.
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+static MCDisassembler *createARMDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new ARMDisassembler(STI, Ctx, T.createMCInstrInfo()); // NOTE(review): raw new; presumably the caller takes ownership.
+}
+
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMDisassembler() { // One factory serves all four targets.
+  TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
+                                         createARMDisassembler);
+}
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index bb07d79c9374..50f4042102bf 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -457,5 +457,4 @@ Pass *llvm::createMVETailPredicationPass() {
 
 char MVETailPredication::ID = 0;
 
-INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
-INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
+INITIALIZE_PASS(MVETailPredication, DEBUG_TYPE, DESC, false, false)
diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt
index def67cfae727..ff84e07fa084 100644
--- a/llvm/lib/Target/ARM/README.txt
+++ b/llvm/lib/Target/ARM/README.txt
@@ -697,22 +697,6 @@ target-neutral one.
 
 //===---------------------------------------------------------------------===//
 
-Optimize unnecessary checks for zero with __builtin_clz/ctz.  Those builtins
-are specified to be undefined at zero, so portable code must check for zero
-and handle it as a special case.  That is unnecessary on ARM where those
-operations are implemented in a way that is well-defined for zero.  For
-example:
-
-int f(int x) { return x ? __builtin_clz(x) : sizeof(int)*8; }
-
-should just be implemented with a CLZ instruction.  Since there are other
-targets, e.g., PPC, that share this behavior, it would be best to implement
-this in a target-independent way: we should probably fold that (when using
-"undefined at zero" semantics) to set the "defined at zero" bit and have
-the code generator expand out the right code.
-
-//===---------------------------------------------------------------------===//
-
 Clean up the test/MC/ARM files to have more robust register choices.
 
 R0 should not be used as a register operand in the assembler tests as it's then
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 8b254fafc438..e91441b12fe6 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -263,11 +263,14 @@ void Thumb2InstrInfo::expandLoadStackGuard(
 
   const auto *GV = cast<GlobalValue>((*MI->memoperands_begin())->getValue());
   const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  bool IsPIC = MF.getTarget().isPositionIndependent();
   if (Subtarget.isTargetELF() && !GV->isDSOLocal())
     expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12);
   else if (!Subtarget.useMovt())
-    expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::t2LDRi12);
-  else if (MF.getTarget().isPositionIndependent())
+    expandLoadStackGuardBase(
+        MI, IsPIC ? ARM::t2LDRLIT_ga_pcrel : ARM::tLDRLIT_ga_abs,
+        ARM::t2LDRi12);
+  else if (IsPIC)
     expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
   else
     expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 0fb33cdcb62d..ad8f7d801843 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -245,7 +245,7 @@ void AVRAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
 bool AVRAsmPrinter::doFinalization(Module &M) {
   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
   const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget();
-  const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl();
+  const AVRSubtarget *SubTM = TM.getSubtargetImpl();
 
   bool NeedsCopyData = false;
   bool NeedsClearBSS = false;
@@ -294,7 +294,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) {
 
 void AVRAsmPrinter::emitStartOfAsmFile(Module &M) {
   const AVRTargetMachine &TM = (const AVRTargetMachine &)MMI->getTarget();
-  const AVRSubtarget *SubTM = (const AVRSubtarget *)TM.getSubtargetImpl();
+  const AVRSubtarget *SubTM = TM.getSubtargetImpl();
   if (!SubTM)
     return;
 
diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td
index e1e65b56370c..72ea3bc1f460 100644
--- a/llvm/lib/Target/AVR/AVRInstrFormats.td
+++ b/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -79,6 +79,7 @@ class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
 //===----------------------------------------------------------------------===//
 class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
     : AVRInst16<outs, ins, asmstr, pattern> {
+  bits<0> z;
   bits<5> rd;
 
   let Inst{15 - 12} = 0b1001;
@@ -127,8 +128,6 @@ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
   let Inst{11 - 9} = f{6 - 4};
   let Inst{8 - 4} = rd;
   let Inst{3 - 0} = f{3 - 0};
-
-  let DecoderMethod = "decodeFRd";
 }
 
 //===----------------------------------------------------------------------===//
@@ -200,57 +199,64 @@ class FSTLD<bit type, bits<2> mode, dag outs, dag ins, string asmstr,
 //===---------------------------------------------------------------------===//
 class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
     : AVRInst16<outs, ins, asmstr, pattern> {
+  bits<0> z;
   bits<5> rd;
 
-  let Inst{15 - 12} = 0b1001;
-
-  let Inst{11 - 9} = 0b000;
-  let Inst{8} = rd{4};
-
-  let Inst{7 - 4} = rd{3 - 0};
-
+  let Inst{15 - 9} = 0b1001000;
+  let Inst{8 - 4} = rd;
   let Inst{3 - 2} = 0b01;
   let Inst{1} = e;
   let Inst{0} = p;
-
-  let DecoderMethod = "decodeFLPMX";
 }
 
 //===----------------------------------------------------------------------===//
 // MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|>
 // d = destination = 4 bits
 // r = source = 4 bits
-// (Only accepts even registers)
+// (Only accepts register pairs)
 //===----------------------------------------------------------------------===//
 class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
     : AVRInst16<outs, ins, asmstr, pattern> {
-  bits<5> rd;
-  bits<5> rr;
+  bits<4> rd;
+  bits<4> rr;
 
   let Inst{15 - 8} = 0b00000001;
-  let Inst{7 - 4} = rd{4 - 1};
-  let Inst{3 - 0} = rr{4 - 1};
-
-  let DecoderMethod = "decodeFMOVWRdRr";
+  let Inst{7 - 4} = rd;
+  let Inst{3 - 0} = rr;
 }
 
 //===----------------------------------------------------------------------===//
-// MULSrr special encoding: <|0000|0010|dddd|rrrr|>
+// MULS special encoding: <|0000|0010|dddd|rrrr|>
 // d = multiplicand = 4 bits
 // r = multiplier = 4 bits
 // (Only accepts r16-r31)
 //===----------------------------------------------------------------------===//
-class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+class FMULSRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
     : AVRInst16<outs, ins, asmstr, pattern> {
-  bits<5> rd; // accept 5 bits but only encode the lower 4
-  bits<5> rr; // accept 5 bits but only encode the lower 4
+  bits<4> rd;
+  bits<4> rr;
 
-  let Inst{15 - 9} = 0b0000001;
-  let Inst{8} = f;
-  let Inst{7 - 4} = rd{3 - 0};
-  let Inst{3 - 0} = rr{3 - 0};
+  let Inst{15 - 8} = 0b00000010;
+  let Inst{7 - 4} = rd;
+  let Inst{3 - 0} = rr;
+}
 
-  let DecoderMethod = "decodeFMUL2RdRr";
+//===----------------------------------------------------------------------===//
+// MULSU special encoding: <|0000|0011|0ddd|0rrr|>
+// d = multiplicand = 3 bits
+// r = multiplier = 3 bits
+// (Only accepts r16-r23)
+//===----------------------------------------------------------------------===//
+class FMULSURdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : AVRInst16<outs, ins, asmstr, pattern> {
+  bits<3> rd;
+  bits<3> rr;
+
+  let Inst{15 - 8} = 0b00000011;
+  let Inst{7} = 0;
+  let Inst{6 - 4} = rd;
+  let Inst{3} = 0;
+  let Inst{2 - 0} = rr;
 }
 
 // Special encoding for the FMUL family of instructions.
@@ -273,8 +279,6 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{6 - 4} = rd;
   let Inst{3} = f{0};
   let Inst{2 - 0} = rr;
-
-  let DecoderMethod = "decodeFFMULRdRr";
 }
 
 //===----------------------------------------------------------------------===//
@@ -286,16 +290,14 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
 //===----------------------------------------------------------------------===//
 class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
     : AVRInst16<outs, ins, asmstr, pattern> {
-  bits<5> rd; // accept 5 bits but only encode bits 1 and 2
+  bits<2> rd;
   bits<6> k;
 
   let Inst{15 - 9} = 0b1001011;
   let Inst{8} = f;
   let Inst{7 - 6} = k{5 - 4};
-  let Inst{5 - 4} = rd{2 - 1};
+  let Inst{5 - 4} = rd;
   let Inst{3 - 0} = k{3 - 0};
-
-  let DecoderMethod = "decodeFWRdK";
 }
 
 //===----------------------------------------------------------------------===//
@@ -313,8 +315,6 @@ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{10 - 9} = A{5 - 4};
   let Inst{8 - 4} = rd;
   let Inst{3 - 0} = A{3 - 0};
-
-  let DecoderMethod = "decodeFIORdA";
 }
 
 //===----------------------------------------------------------------------===//
@@ -332,8 +332,6 @@ class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{10 - 9} = A{5 - 4};
   let Inst{8 - 4} = rr;
   let Inst{3 - 0} = A{3 - 0};
-
-  let DecoderMethod = "decodeFIOARr";
 }
 
 //===----------------------------------------------------------------------===//
@@ -348,17 +346,10 @@ class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
   bits<5> addr;
   bits<3> b;
 
-  let Inst{15 - 12} = 0b1001;
-
-  let Inst{11 - 10} = 0b10;
+  let Inst{15 - 10} = 0b100110;
   let Inst{9 - 8} = t;
-
-  let Inst{7 - 4} = addr{4 - 1};
-
-  let Inst{3} = addr{0};
+  let Inst{7 - 3} = addr;
   let Inst{2 - 0} = b{2 - 0};
-
-  let DecoderMethod = "decodeFIOBIT";
 }
 
 //===----------------------------------------------------------------------===//
@@ -417,8 +408,6 @@ class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr,
   let Inst{10} = f;
   let Inst{9 - 3} = k;
   let Inst{2 - 0} = s;
-
-  let DecoderMethod = "decodeCondBranch";
 }
 
 //===----------------------------------------------------------------------===//
@@ -442,8 +431,6 @@ class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{15 - 13} = 0b110;
   let Inst{12} = f;
   let Inst{11 - 0} = k;
-
-  let DecoderMethod = "decodeFBRk";
 }
 
 //===----------------------------------------------------------------------===//
@@ -537,14 +524,8 @@ class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
 
   let Inst{11} = 0;
   let Inst{10} = f;
-  let Inst{9 - 8} = k{6 - 5};
-
-  let Inst{7 - 4} = k{4 - 1};
-
-  let Inst{3} = k{0};
+  let Inst{9 - 3} = k;
   let Inst{2 - 0} = s;
-
-  let DecoderMethod = "decodeCondBranch";
 }
 
 class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 601068bf1793..ce9908597dca 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -29,8 +29,8 @@
 
 namespace llvm {
 
-AVRInstrInfo::AVRInstrInfo(AVRSubtarget &STI)
-    : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(),
+AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI)
+    : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(),
       STI(STI) {}
 
 void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h
index 1c92f173d254..759aea201096 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -65,7 +65,7 @@ enum TOF {
 /// Utilities related to the AVR instruction set.
 class AVRInstrInfo : public AVRGenInstrInfo {
 public:
-  explicit AVRInstrInfo(AVRSubtarget &STI);
+  explicit AVRInstrInfo(const AVRSubtarget &STI);
 
   const AVRRegisterInfo &getRegisterInfo() const { return RI; }
   const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 958e1383acef..02fb905f5fb6 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -204,16 +204,19 @@ def memspi : Operand<iPTR> {
 def relbrtarget_7 : Operand<OtherVT> {
   let PrintMethod = "printPCRelImm";
   let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+  let DecoderMethod = "decodeRelCondBrTarget7";
 }
 
 def brtarget_13 : Operand<OtherVT> {
   let PrintMethod = "printPCRelImm";
   let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+  let DecoderMethod = "decodeRelCondBrTarget13";
 }
 
 def rcalltarget_13 : Operand<i16> {
   let PrintMethod = "printPCRelImm";
   let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+  let DecoderMethod = "decodeRelCondBrTarget13";
 }
 
 // The target of a 22 or 16-bit call/jmp instruction.
@@ -492,13 +495,13 @@ let isCommutable = 1, Defs = [R1, R0, SREG] in {
                         "mul\t$rd, $rr", []>,
                   Requires<[SupportsMultiplication]>;
 
-    def MULSRdRr : FMUL2RdRr<0, (outs), (ins LD8:$rd, LD8:$rr),
+    def MULSRdRr : FMULSRdRr<(outs), (ins LD8:$rd, LD8:$rr),
                              "muls\t$rd, $rr", []>,
                    Requires<[SupportsMultiplication]>;
   }
 
-  def MULSURdRr : FMUL2RdRr<1, (outs), (ins LD8lo:$rd, LD8lo:$rr),
-                            "mulsu\t$rd, $rr", []>,
+  def MULSURdRr : FMULSURdRr<(outs), (ins LD8lo:$rd, LD8lo:$rr),
+                             "mulsu\t$rd, $rr", []>,
                   Requires<[SupportsMultiplication]>;
 
   def FMUL : FFMULRdRr<0b01, (outs), (ins LD8lo:$rd, LD8lo:$rr),
@@ -1230,7 +1233,9 @@ let Uses = [R1, R0] in {
 
   let Defs = [R31R30] in 
   def SPMZPi : F16<0b1001010111111000, (outs), (ins ZREG:$z), "spm $z+", []>,
-               Requires<[HasSPMX]>;
+               Requires<[HasSPMX]> {
+    bits<0> z;
+  }
 }
 
 // Read data from IO location operations.
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index 21b4aedea44c..182f92c684dc 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -68,33 +68,37 @@ def R31 : AVRReg<31, "r31", [], ["zh"]>, DwarfRegNum<[31]>;
 def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
 def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
 
+// 16 bit GPR pairs.
 let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in {
-  // 16 bit GPR pairs.
-  def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
+  // The value 16 for the encoding is arbitrary. The SP register is not encoded
+  // into instructions; they use it implicitly depending on the opcode.
+  def SP : AVRReg<16, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
 
   // The pointer registers (X,Y,Z) are a special case because they
   // are printed as a `high:low` pair when a DREG is expected,
   // but printed using `X`, `Y`, `Z` when a pointer register is expected.
+  // DREG registers are only used in ADIW, SBIW and MOVW instructions.
   let RegAltNameIndices = [ptr] in {
-    def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
-    def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
-    def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+    def R31R30 : AVRReg<15, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+    def R29R28 : AVRReg<14, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+    def R27R26 : AVRReg<13, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
   }
-  def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
-  def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
-  def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
-  def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
-  def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
-  def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
-  def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
-  def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
-  def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
-  def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
-  def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
-  def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+  def R25R24 : AVRReg<12, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+  def R23R22 : AVRReg<11, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+  def R21R20 : AVRReg<10, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+  def R19R18 : AVRReg<9, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+  def R17R16 : AVRReg<8, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+  def R15R14 : AVRReg<7, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+  def R13R12 : AVRReg<6, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+  def R11R10 : AVRReg<5, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+  def R9R8 : AVRReg<4, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+  def R7R6 : AVRReg<3, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+  def R5R4 : AVRReg<2, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+  def R3R2 : AVRReg<1, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
   def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
 
-  // Pseudo registers for unaligned i16
+  // Pseudo registers for unaligned i16. These are only used in pseudo
+  // instructions, so encoding values are arbitrary.
   def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>;
   def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>;
   def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>;
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 4e00b192b875..a8650146e988 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -38,7 +38,6 @@ using namespace llvm;
 namespace {
 /// Parses AVR assembly from a stream.
 class AVRAsmParser : public MCTargetAsmParser {
-  const MCSubtargetInfo &STI;
   MCAsmParser &Parser;
   const MCRegisterInfo *MRI;
   const std::string GENERATE_STUBS = "gs";
@@ -93,7 +92,7 @@ class AVRAsmParser : public MCTargetAsmParser {
 public:
   AVRAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
                const MCInstrInfo &MII, const MCTargetOptions &Options)
-      : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+      : MCTargetAsmParser(Options, STI, MII), Parser(Parser) {
     MCAsmParserExtension::Initialize(Parser);
     MRI = getContext().getRegisterInfo();
 
@@ -318,7 +317,7 @@ bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc,
 
 bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const {
   Inst.setLoc(Loc);
-  Out.emitInstruction(Inst, STI);
+  Out.emitInstruction(Inst, *STI);
 
   return false;
 }
@@ -411,7 +410,7 @@ bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) {
 
   // Reject R0~R15 on avrtiny.
   if (AVR::R0 <= Reg && Reg <= AVR::R15 &&
-      STI.hasFeature(AVR::FeatureTinyEncoding))
+      STI->hasFeature(AVR::FeatureTinyEncoding))
     return Error(Parser.getTok().getLoc(), "invalid register on avrtiny");
 
   AsmToken const &T = Parser.getTok();
@@ -758,7 +757,7 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
 
       // Reject R0~R15 on avrtiny.
       if (0 <= RegNum && RegNum <= 15 &&
-          STI.hasFeature(AVR::FeatureTinyEncoding))
+          STI->hasFeature(AVR::FeatureTinyEncoding))
         return Match_InvalidRegisterOnTiny;
 
       std::ostringstream RegName;
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 948588cb9a75..3a840a371497 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -61,7 +61,7 @@ LLVMInitializeAVRDisassembler() {
                                          createAVRDisassembler);
 }
 
-static const uint16_t GPRDecoderTable[] = {
+static constexpr MCRegister GPRDecoderTable[] = {
     AVR::R0,  AVR::R1,  AVR::R2,  AVR::R3,  AVR::R4,  AVR::R5,  AVR::R6,
     AVR::R7,  AVR::R8,  AVR::R9,  AVR::R10, AVR::R11, AVR::R12, AVR::R13,
     AVR::R14, AVR::R15, AVR::R16, AVR::R17, AVR::R18, AVR::R19, AVR::R20,
@@ -69,6 +69,13 @@ static const uint16_t GPRDecoderTable[] = {
     AVR::R28, AVR::R29, AVR::R30, AVR::R31,
 };
 
+static constexpr MCRegister GPRPairDecoderTable[] = {
+    AVR::R1R0,   AVR::R3R2,   AVR::R5R4,   AVR::R7R6,
+    AVR::R9R8,   AVR::R11R10, AVR::R13R12, AVR::R15R14,
+    AVR::R17R16, AVR::R19R18, AVR::R21R20, AVR::R23R22,
+    AVR::R25R24, AVR::R27R26, AVR::R29R28, AVR::R31R30,
+};
+
 static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const MCDisassembler *Decoder) {
@@ -83,96 +90,41 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
                                            const MCDisassembler *Decoder) {
-  if (RegNo > 15)
-    return MCDisassembler::Fail;
-
-  unsigned Register = GPRDecoderTable[RegNo + 16];
-  Inst.addOperand(MCOperand::createReg(Register));
+  assert(isUInt<4>(RegNo) && "LD8 register field must fit in 4 bits");
+  // The field selects one of r16...r31, so bias the table index by 16.
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo]));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-
-static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-
-static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-#include "AVRGenDisassemblerTables.inc"
+static DecodeStatus DecodeLD8loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const MCDisassembler *Decoder) {
+  assert(isUInt<3>(RegNo) && "LD8lo register field must fit in 3 bits");
+  // The field selects one of r16...r23, so bias the table index by 16.
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[16 + RegNo]));
+  return MCDisassembler::Success;
+}
 
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder) {
-  unsigned addr = 0;
-  addr |= fieldFromInstruction(Insn, 0, 4);
-  addr |= fieldFromInstruction(Insn, 9, 2) << 4;
-  unsigned reg = fieldFromInstruction(Insn, 4, 5);
-  Inst.addOperand(MCOperand::createImm(addr));
-  if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeDREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const MCDisassembler *Decoder) {
+  assert(isUInt<4>(RegNo) && "DREGS register field must fit in 4 bits");
+  Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[RegNo]));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder) {
-  unsigned addr = 0;
-  addr |= fieldFromInstruction(Insn, 0, 4);
-  addr |= fieldFromInstruction(Insn, 9, 2) << 4;
-  unsigned reg = fieldFromInstruction(Insn, 4, 5);
-  if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createImm(addr));
+static DecodeStatus DecodeIWREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Address,
+                                              const MCDisassembler *Decoder) {
+  assert(isUInt<2>(RegNo) && "IWREGS register field must fit in 2 bits");
+  // Only R25R24, R27R26, R29R28 and R31R30 are legal: table entries 12...15.
+  Inst.addOperand(MCOperand::createReg(GPRPairDecoderTable[12 + RegNo]));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder) {
-  unsigned addr = fieldFromInstruction(Insn, 3, 5);
-  unsigned b = fieldFromInstruction(Insn, 0, 3);
-  Inst.addOperand(MCOperand::createImm(addr));
-  Inst.addOperand(MCOperand::createImm(b));
+static DecodeStatus DecodeZREGRegisterClass(MCInst &Inst,
+                                            const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createReg(AVR::R31R30)); // No field to decode.
   return MCDisassembler::Success;
 }
 
@@ -185,78 +137,19 @@ static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
-                              const MCDisassembler *Decoder) {
-  unsigned d = fieldFromInstruction(Insn, 4, 5);
-  if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder) {
-  if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createReg(AVR::R31R30));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder) {
-  unsigned d = fieldFromInstruction(Insn, 4, 3) + 16;
-  unsigned r = fieldFromInstruction(Insn, 0, 3) + 16;
-  if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder) {
-  unsigned r = fieldFromInstruction(Insn, 4, 4) * 2;
-  unsigned d = fieldFromInstruction(Insn, 0, 4) * 2;
-  if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder) {
-  unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25
-  unsigned k = 0;
-  k |= fieldFromInstruction(Insn, 0, 4);
-  k |= fieldFromInstruction(Insn, 6, 2) << 4;
-  if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createImm(k));
+static DecodeStatus decodeRelCondBrTarget7(MCInst &Inst, unsigned Field,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  // Sign-extend the 7-bit field and double it: byte range [-128, 126].
+  Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 7) * 2));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder) {
-  unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16;
-  unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16;
-  if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-  if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
+static DecodeStatus decodeRelCondBrTarget13(MCInst &Inst, unsigned Field,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  // Sign-extend the 12-bit field and double it: byte range [-4096, 4094].
+  Inst.addOperand(MCOperand::createImm(SignExtend32(Field, 12) * 2));
   return MCDisassembler::Success;
 }
 
@@ -277,59 +170,6 @@ static DecodeStatus decodeMemri(MCInst &Inst, unsigned Insn, uint64_t Address,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeFBRk(MCInst &Inst, unsigned Insn, uint64_t Address,
-                               const MCDisassembler *Decoder) {
-  // Decode the opcode.
-  switch (Insn & 0xf000) {
-  case 0xc000:
-    Inst.setOpcode(AVR::RJMPk);
-    break;
-  case 0xd000:
-    Inst.setOpcode(AVR::RCALLk);
-    break;
-  default: // Unknown relative branch instruction.
-    return MCDisassembler::Fail;
-  }
-  // Decode the relative offset.
-  int16_t Offset = ((int16_t)((Insn & 0xfff) << 4)) >> 3;
-  Inst.addOperand(MCOperand::createImm(Offset));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeCondBranch(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder) {
-  // These 8 instructions are not defined as aliases of BRBS/BRBC.
-  DenseMap<unsigned, unsigned> brInsts = {
-      {0x000, AVR::BRLOk}, {0x400, AVR::BRSHk}, {0x001, AVR::BREQk},
-      {0x401, AVR::BRNEk}, {0x002, AVR::BRMIk}, {0x402, AVR::BRPLk},
-      {0x004, AVR::BRLTk}, {0x404, AVR::BRGEk}};
-
-  // Get the relative offset.
-  int16_t Offset = ((int16_t)((Insn & 0x3f8) << 6)) >> 8;
-
-  // Search the instruction pattern.
-  auto NotAlias = [&Insn](const std::pair<unsigned, unsigned> &I) {
-    return (Insn & 0x407) != I.first;
-  };
-  llvm::partition(brInsts, NotAlias);
-  auto It = llvm::partition_point(brInsts, NotAlias);
-
-  // Decode the instruction.
-  if (It != brInsts.end()) {
-    // This instruction is not an alias of BRBC/BRBS.
-    Inst.setOpcode(It->second);
-    Inst.addOperand(MCOperand::createImm(Offset));
-  } else {
-    // Fall back to an ordinary BRBS/BRBC.
-    Inst.setOpcode(Insn & 0x400 ? AVR::BRBCsk : AVR::BRBSsk);
-    Inst.addOperand(MCOperand::createImm(Insn & 7));
-    Inst.addOperand(MCOperand::createImm(Offset));
-  }
-
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
                                     uint64_t Address,
                                     const MCDisassembler *Decoder) {
@@ -435,6 +275,8 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn,
   return MCDisassembler::Success;
 }
 
+#include "AVRGenDisassemblerTables.inc"
+
 static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
                                       uint64_t &Size, uint32_t &Insn) {
   if (Bytes.size() < 2) {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index 481219164a0f..5adffeed04bd 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -101,23 +101,6 @@ const char *AVRInstPrinter::getPrettyRegisterName(MCRegister Reg,
 void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                   raw_ostream &O) {
   const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).operands()[OpNo];
-  if (MOI.RegClass == AVR::ZREGRegClassID) {
-    // Special case for the Z register, which sometimes doesn't have an operand
-    // in the MCInst.
-    O << "Z";
-    return;
-  }
-
-  if (OpNo >= MI->size()) {
-    // Not all operands are correctly disassembled at the moment. This means
-    // that some machine instructions won't have all the necessary operands
-    // set.
-    // To avoid asserting, print <unknown> instead until the necessary support
-    // has been implemented.
-    O << "<unknown>";
-    return;
-  }
-
   const MCOperand &Op = MI->getOperand(OpNo);
 
   if (Op.isReg()) {
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 352017e9b929..dadba52de462 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -193,27 +193,6 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
   switch (Opcode) {
   default:
     break;
-  case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IntNo = Node->getConstantOperandVal(1);
-    switch (IntNo) {
-    case Intrinsic::bpf_load_byte:
-    case Intrinsic::bpf_load_half:
-    case Intrinsic::bpf_load_word: {
-      SDLoc DL(Node);
-      SDValue Chain = Node->getOperand(0);
-      SDValue N1 = Node->getOperand(1);
-      SDValue Skb = Node->getOperand(2);
-      SDValue N3 = Node->getOperand(3);
-
-      SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
-      Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
-      Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
-      break;
-    }
-    }
-    break;
-  }
-
   case ISD::FrameIndex: {
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
     EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 70bc163615f6..fb4efcfe8614 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "BPFInstrInfo.h"
 #include "BPF.h"
+#include "BPFSubtarget.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -25,8 +26,8 @@
 
 using namespace llvm;
 
-BPFInstrInfo::BPFInstrInfo()
-    : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI)
+    : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
 
 void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h
index d8bbad44e314..2359e43e483f 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -20,12 +20,13 @@
 #include "BPFGenInstrInfo.inc"
 
 namespace llvm {
+class BPFSubtarget;
 
 class BPFInstrInfo : public BPFGenInstrInfo {
   const BPFRegisterInfo RI;
 
 public:
-  BPFInstrInfo();
+  explicit BPFInstrInfo(const BPFSubtarget &STI);
 
   const BPFRegisterInfo &getRegisterInfo() const { return RI; }
 
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index b21f1a0eee3b..de7dae2c8ca6 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -1189,10 +1189,9 @@ let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
     hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in {
 class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
     : TYPE_LD_ST<BPF_ABS.Value, SizeOp.Value,
-                 (outs),
-                 (ins GPR:$skb, i64imm:$imm),
+                 (outs), (ins i64imm:$imm),
                  "r0 = *("#OpcodeStr#" *)skb[$imm]",
-                 [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> {
+                 [(set R0, (OpNode R6, i64immSExt32:$imm))]> {
   bits<32> imm;
 
   let Inst{31-0} = imm;
@@ -1201,10 +1200,9 @@ class LOAD_ABS<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
 
 class LOAD_IND<BPFWidthModifer SizeOp, string OpcodeStr, Intrinsic OpNode>
     : TYPE_LD_ST<BPF_IND.Value, SizeOp.Value,
-                 (outs),
-                 (ins GPR:$skb, GPR:$val),
+                 (outs), (ins GPR:$val),
                  "r0 = *("#OpcodeStr#" *)skb[$val]",
-                 [(set R0, (OpNode GPR:$skb, GPR:$val))]> {
+                 [(set R0, (OpNode R6, GPR:$val))]> {
   bits<4> val;
 
   let Inst{55-52} = val;
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 4167547680b1..a7ecc39fad7b 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -103,7 +103,7 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
 BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
                            const std::string &FS, const TargetMachine &TM)
     : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
-      FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), FrameLowering(*this),
       TLInfo(TM, *this) {
   IsLittleEndian = TT.isLittleEndian();
 
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index bed6bc98b167..ba4b48990c64 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -235,7 +235,7 @@ void BTFTypeEnum64::completeType(BTFDebug &BDebug) {
     BTFEnum.NameOff = BDebug.addString(Enum->getName());
     uint64_t Value;
     if (Enum->isUnsigned())
-      Value = static_cast<uint64_t>(Enum->getValue().getZExtValue());
+      Value = Enum->getValue().getZExtValue();
     else
       Value = static_cast<uint64_t>(Enum->getValue().getSExtValue());
     BTFEnum.Val_Lo32 = Value;
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index b5bb1c08c564..230cf3b0ddbe 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -205,18 +205,6 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
     Op.setImm(Make_64(Hi, Op.getImm()));
     break;
   }
-  case BPF::LD_ABS_B:
-  case BPF::LD_ABS_H:
-  case BPF::LD_ABS_W:
-  case BPF::LD_IND_B:
-  case BPF::LD_IND_H:
-  case BPF::LD_IND_W: {
-    auto Op = Instr.getOperand(0);
-    Instr.clear();
-    Instr.addOperand(MCOperand::createReg(BPF::R6));
-    Instr.addOperand(Op);
-    break;
-  }
   }
 
   return Result;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
index 5296d282c689..abf5cac0013d 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -168,7 +168,9 @@ class I_16_RET<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
 // Instructions(3): cmpnei32, cmphsi32, cmplti32
 class I_16_X<bits<5> sop, string op, Operand operand>
     : CSKY32Inst<AddrModeNone, 0x3a, (outs CARRY:$ca),
-    (ins GPR:$rx, operand:$imm16), !strconcat(op, "\t$rx, $imm16"), []> {
+                 (ins GPR:$rx, operand:$imm16),
+                 !strconcat(op, "\t$rx, $imm16"), []> {
+  bits<0> ca;
   bits<16> imm16;
   bits<5> rx;
   let Inst{25 - 21} = sop;
@@ -263,8 +265,9 @@ class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op>
 class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
              list<dag> pattern>
     : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
-    (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5),
-    !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+                 (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5),
+                 !strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
+  bits<0> cond;
   bits<5> rz;
   bits<5> rx;
   bits<5> imm5;
@@ -469,9 +472,10 @@ class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v>
 // Instructions(1): btsti32
 class I_5_X<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
             list<dag> pattern>
-    : CSKY32Inst<AddrModeNone, 0x31,
-    (outs CARRY:$ca), (ins GPR:$rx, ImmType:$imm5),
-    !strconcat(op, "\t$rx, $imm5"), pattern> {
+    : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),
+                 (ins GPR:$rx, ImmType:$imm5),
+                 !strconcat(op, "\t$rx, $imm5"), pattern> {
+  bits<0> ca;
   bits<5> imm5;
   bits<5> rx;
   let Inst{25 - 21} = imm5;
@@ -581,9 +585,9 @@ class R_XXZ<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op,
 // Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
 // Instructions:(4) cmpne32, cmphs32, cmplt32, tst32
 class R_YX<bits<6> sop, bits<5> pcode, string op>
-    : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),
-                 (ins GPR:$rx, GPR:$ry),
+    : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins GPR:$rx, GPR:$ry),
                  !strconcat(op, "\t$rx, $ry"), []> {
+  bits<0> ca;
   bits<5> ry;
   bits<5> rx;
   let Inst{25 - 21} = ry;
@@ -642,8 +646,9 @@ class R_X<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op, list<dag> pa
 // Format< OP[6] | 00000[5] | 00000[5] | SOP[6] | PCODE[5] | RZ[5] >
 // Instructions:(2) mvc32, mvcv32
 class R_Z_1<bits<6> sop, bits<5> pcode, string op>
-    : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
-                 (ins CARRY:$ca), !strconcat(op, "\t$rz"), []> {
+    : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz), (ins CARRY:$ca),
+                 !strconcat(op, "\t$rz"), []> {
+  bits<0> ca;
   bits<5> rz;
   let Inst{25 - 21} = 0;
   let Inst{20 - 16} = 0;
@@ -656,7 +661,8 @@ class R_Z_1<bits<6> sop, bits<5> pcode, string op>
 // Instructions:(2) clrf32, clrt32
 class R_Z_2<bits<6> sop, bits<5> pcode, string op>
     : CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
-    (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> {
+                 (ins CARRY:$ca, GPR:$false), !strconcat(op, "\t$rz"), []> {
+  bits<0> ca;
   bits<5> rz;
   let Inst{25 - 21} = rz;
   let Inst{20 - 16} = 0;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
index ea0761d97545..5cd970d27d47 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
@@ -16,8 +16,9 @@ class J16<bits<5> sop, string opstr, dag ins>
 }
 
 class J16_B<bits<5> sop, string opstr>
-  : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset),
-    !strconcat(opstr, "\t$offset"), []> {
+    : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset),
+                 !strconcat(opstr, "\t$offset"), []> {
+  bits<0> ca;
   bits<10> offset;
   let Inst{15} = 0;
   let Inst{14 - 10} = sop;
@@ -66,6 +67,8 @@ class R16_XZ_BINOP_NOPat<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
 class R16_XZ_BINOP_C<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
   AddrModeNone, (outs sGPR:$rz, CARRY:$cout),
   (ins sGPR:$rZ, sGPR:$rx, CARRY:$cin), !strconcat(opstr, "\t$rz, $rx"), []> {
+  bits<0> cout;
+  bits<0> cin;
   bits<4> rz;
   bits<4> rx;
   let Inst{15, 14} = 0b01;
@@ -101,9 +104,10 @@ class R16_Z_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
   let Constraints = "$rz = $rx";
 }
 
-class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst<
-  AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"),
-  []> {
+class R16_XY_CMP<bits<2> sop, string opstr>
+    : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
+                 !strconcat(opstr, "\t$rx, $ry"), []> {
+  bits<0> ca;
   bits<4> ry;
   bits<4> rx;
   let Inst{15, 14} = 0b01;
@@ -145,9 +149,11 @@ class I16_Z_5<bits<3> sop, dag outs, dag ins,string opstr>
   let Inst{4 - 0} = imm5;
 }
 
-class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst<
-  AddrModeNone, (outs CARRY:$ca), (ins mGPR:$rx, Immoperand:$imm5),
-  !strconcat(opstr, "\t$rx, $imm5"), []> {
+class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand>
+    : CSKY16Inst<AddrModeNone, (outs CARRY:$ca),
+                 (ins mGPR:$rx, Immoperand:$imm5),
+                 !strconcat(opstr, "\t$rx, $imm5"), []> {
+  bits<0> ca;
   bits<3> rx;
   bits<5> imm5;
   let Inst{15, 14} = 0b00;
@@ -158,9 +164,12 @@ class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst<
   let isCompare = 1;
 }
 
-class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst<
-  AddrModeNone, (outs GPRSP:$sp2), (ins GPRSP:$sp1, uimm7_2:$imm7),
-  !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> {
+class I16_SP_IMM7<bits<3> sop, string opstr>
+    : CSKY16Inst<AddrModeNone, (outs GPRSP:$sp2),
+                 (ins GPRSP:$sp1, uimm7_2:$imm7),
+                 !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> {
+  bits<0> sp2;
+  bits<0> sp1;
   bits<7> imm7;
   let Inst{15, 14} = 0b00;
   let Inst{13 - 10} = 0b0101;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
index 446670a4d0a9..a40874b054d8 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
@@ -91,15 +91,21 @@ multiclass FT_XZ<bits<6> sop, string op, PatFrag opnode> {
 }
 
 let vrz = 0, isCompare = 1 in {
-class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
-  : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), !strconcat(op#op_su, "\t$vrx, $vry"),
-  []>;
-
-let vry = 0 in{
-class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
-  : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrx"),
-  []>;
-}
+  class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su,
+                RegisterOperand regtype>
+      : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca),
+                   (ins regtype:$vrx, regtype:$vry),
+                   !strconcat(op#op_su, "\t$vrx, $vry"), []> {
+    bits<0> ca;
+  }
+
+  let vry = 0 in
+  class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su,
+                RegisterOperand regtype>
+      : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx),
+                   !strconcat(op#op_su, "\t$vrx"), []> {
+    bits<0> ca;
+  }
 }
 
 class F_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype>
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
index 641ad623f140..bd7c554565cd 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
@@ -91,8 +91,9 @@ multiclass F2_XZ_SET_T<bits<6> sop, string op, string suffix = ""> {
 let vrz = 0, isCompare = 1 in
 class F2_CXY<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
     : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx, $vry"),
-             (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry),
-             []>;
+             (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), []> {
+  bits<0> ca;
+}
 
 multiclass F2_CXY_T<bits<6> sop, string op> {
   def _S : F2_CXY<0b00000, FPR32Op, sop, op#".32">;
@@ -103,9 +104,10 @@ multiclass F2_CXY_T<bits<6> sop, string op> {
 
 let vrz = 0, vry = 0, isCompare = 1 in
 class F2_CX<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
-    : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"),
-             (outs CARRY:$ca), (ins regtype:$vrx),
-             []>;
+    : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"), (outs CARRY:$ca),
+             (ins regtype:$vrx), []> {
+  bits<0> ca;
+}
 
 multiclass F2_CX_T<bits<6> sop, string op> {
   def _S : F2_CX<0b00000, FPR32Op, sop, op#".32">;
@@ -183,7 +185,10 @@ class F2_LDSTR_D<bits<1> sop, string op, dag outs, dag ins>
 class F2_CXYZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
     : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx, $vry"),
              (outs regtype:$vrz), (ins CARRY:$ca, regtype:$vrx, regtype:$vry),
-             []>;
+             []> {
+  bits<0> ca;
+}
+
 multiclass F2_CXYZ_T<bits<6> sop, string op> {
   def _S : F2_CXYZ<0b00000, FPR32Op, sop, op#".32">;
   let Predicates = [HasFPUv3_DF] in
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index ccb3f16394d4..619a797be6dc 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -24,8 +24,9 @@ using namespace llvm;
 #define GET_INSTRINFO_CTOR_DTOR
 #include "CSKYGenInstrInfo.inc"
 
-CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI)
-    : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) {
+CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI)
+    : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP),
+      STI(STI) {
   v2sf = STI.hasFPUv2SingleFloat();
   v2df = STI.hasFPUv2DoubleFloat();
   v3sf = STI.hasFPUv3SingleFloat();
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
index 98f583e8b405..6451c0af14fc 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
@@ -33,7 +33,7 @@ protected:
   const CSKYSubtarget &STI;
 
 public:
-  explicit CSKYInstrInfo(CSKYSubtarget &STI);
+  explicit CSKYInstrInfo(const CSKYSubtarget &STI);
 
   Register isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index c6bfc2495ae2..82e271e5b556 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -586,14 +586,23 @@ let Predicates = [iHasE2] in {
     BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>, "rotl32">;
 
   def BMASKI32 : I_5_Z<0b010100, 0x1, "bmaski32", oimm5, []>;
-  def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
-  def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
-  def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
-  def XSR32 : I_5_XZ<0x13, 0x8, "xsr32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []>;
+  def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32", (outs GPR:$rz, CARRY:$cout),
+                      (ins GPR:$rx, oimm5:$imm5), []> {
+    bits<0> cout;
+  }
+  def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32", (outs GPR:$rz, CARRY:$cout),
+                      (ins GPR:$rx, oimm5:$imm5), []> {
+    bits<0> cout;
+  }
+  def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32", (outs GPR:$rz, CARRY:$cout),
+                      (ins GPR:$rx, oimm5:$imm5), []> {
+    bits<0> cout;
+  }
+  def XSR32 : I_5_XZ<0x13, 0x8, "xsr32", (outs GPR:$rz, CARRY:$cout),
+                     (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []> {
+    bits<0> cout;
+    bits<0> cin;
+  }
 
   def IXH32 : R_YXZ_SP_F1<0x2, 0x1,
     BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 1)))>, "ixh32">;
@@ -605,9 +614,15 @@ let Predicates = [iHasE2] in {
 
   let isCommutable = 1, isAdd = 1 in
   def ADDC32 : R_YXZ<0x31, 0x0, 0x2, (outs GPR:$rz, CARRY:$cout),
-    (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []>;
+                     (ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []> {
+    bits<0> cout;
+    bits<0> cin;
+  }
   def SUBC32 : R_YXZ<0x31, 0x0, 0x8, (outs GPR:$rz, CARRY:$cout),
-    (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []>;
+                     (ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []> {
+    bits<0> cout;
+    bits<0> cin;
+  }
 
   def INCF32 : I_5_ZX<0x3, 0x1, "incf32", uimm5, []>;
   def INCT32 : I_5_ZX<0x3, 0x2, "inct32", uimm5, []>;
@@ -621,12 +636,18 @@ let Predicates = [iHas2E3] in {
   def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
     BinOpFrag<(udiv node:$LHS, node:$RHS)>, "divu32">;
 
-  def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
-  def DECLT32 : I_5_XZ<0x4, 0x2, "declt32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
-  def DECNE32 : I_5_XZ<0x4, 0x4, "decne32",
-    (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
+  def DECGT32 : I_5_XZ<0x4, 0x1, "decgt32", (outs GPR:$rz, CARRY:$cout),
+                       (ins GPR:$rx, uimm5:$imm5), []> {
+    bits<0> cout;
+  }
+  def DECLT32 : I_5_XZ<0x4, 0x2, "declt32", (outs GPR:$rz, CARRY:$cout),
+                       (ins GPR:$rx, uimm5:$imm5), []> {
+    bits<0> cout;
+  }
+  def DECNE32 : I_5_XZ<0x4, 0x4, "decne32", (outs GPR:$rz, CARRY:$cout),
+                       (ins GPR:$rx, uimm5:$imm5), []> {
+    bits<0> cout;
+  }
 
   def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>;
   let isCodeGenOnly = 1 in {
@@ -744,8 +765,9 @@ let Predicates = [iHas2E3] in {
   def CMPHS32 : R_YX<0x1, 0x1, "cmphs32">;
   def CMPLT32 : R_YX<0x1, 0x2, "cmplt32">;
 
-  def SETC32 : CSKY32Inst<AddrModeNone, 0x31,
-    (outs CARRY:$ca), (ins), "setc32", []> {
+  def SETC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "setc32",
+                          []> {
+    bits<0> ca;
     let Inst{25 - 21} = 0; //rx
     let Inst{20 - 16} = 0; //ry
     let Inst{15 - 10} = 0x1;
@@ -753,8 +775,9 @@ let Predicates = [iHas2E3] in {
     let Inst{4 - 0} = 0;
     let isCompare = 1;
   }
-  def CLRC32 : CSKY32Inst<AddrModeNone, 0x31,
-    (outs CARRY:$ca), (ins), "clrc32", []> {
+  def CLRC32 : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca), (ins), "clrc32",
+                          []> {
+    bits<0> ca;
     let Inst{25 - 21} = 0; //rx
     let Inst{20 - 16} = 0; //ry
     let Inst{15 - 10} = 0x1;
@@ -764,8 +787,10 @@ let Predicates = [iHas2E3] in {
   }
 
   def TST32 : R_YX<0x8, 0x4, "tst32">;
-  def TSTNBZ32 : R_X<0x8, 0x8,
-    (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32", []>;
+  def TSTNBZ32 : R_X<0x8, 0x8, (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32",
+                     []> {
+    bits<0> ca;
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -806,9 +831,14 @@ let isBranch = 1, isTerminator = 1 in {
                      [(br bb:$imm16)]>;
 
   def BT32 : I_16_L<0x3, (outs), (ins CARRY:$ca, br_symbol:$imm16),
-    "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>, Requires<[iHasE2]>;
+                    "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>,
+             Requires<[iHasE2]> {
+    bits<0> ca;
+  }
   def BF32 : I_16_L<0x2, (outs), (ins CARRY:$ca, br_symbol:$imm16),
-    "bf32\t$imm16", []>, Requires<[iHasE2]>;
+                    "bf32\t$imm16", []>, Requires<[iHasE2]> {
+    bits<0> ca;
+  }
 }
 
 let Predicates = [iHas2E3] in {
@@ -1030,7 +1060,10 @@ def SE32 : I_5_XZ_PRIVI<0b010110, 0x1, "se32">;
 def WSC32 : I_5_XZ_PRIVI<0b001111, 0x1, "wsc32">;
 
 def CPOP32 : I_CPOP<(outs), (ins uimm5:$cpid, uimm20:$usdef), "cpop32 <$cpid, ${usdef}>">;
-def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef), "cprc32 <$cpid, ${usdef}>">;
+def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef),
+                  "cprc32 <$cpid, ${usdef}>"> {
+  bits<0> ca;
+}
 def CPRCR32 : I_CP_Z<0b0010, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprcr32 $rz, <$cpid, ${usdef}>">;
 def CPRGR32 : I_CP_Z<0b0000, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprgr32 $rz, <$cpid, ${usdef}>">;
 def CPWCR32 : I_CP_Z<0b0011, (outs), (ins GPR:$rz, uimm5:$cpid, uimm12:$usdef), "cpwcr32 $rz, <$cpid, ${usdef}>">;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
index 3e248019d73f..51645215f32a 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
@@ -102,7 +102,9 @@ def : Pat<(add GPR:$rs1, (oimm8_neg:$im)),
 
 let isAdd = 1 in
 def ADDI16ZSP : I16_Z_8<0b011, (ins GPRSP:$sp, uimm8_2:$imm8),
-                        "addi16\t$rz, $sp, $imm8">;
+                        "addi16\t$rz, $sp, $imm8"> {
+  bits<0> sp;
+}
 
 let isAdd = 1 in
 def ADDI16SPSP : I16_SP_IMM7<0b000,"addi16">;
@@ -142,10 +144,14 @@ def ST16H : I16_XZ_LDST<AddrMode16H, 0b101, "st16.h",
 def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w",
   (outs), (ins mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>;
 
-def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w",
-  (outs mGPR:$rz), (ins GPRSP:$sp, uimm8_2:$addr)>;
-def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w",
-  (outs), (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)>;
+def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", (outs mGPR:$rz),
+                           (ins GPRSP:$sp, uimm8_2:$addr)> {
+  bits<0> sp;
+}
+def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", (outs),
+                           (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)> {
+  bits<0> sp;
+}
 
 //===----------------------------------------------------------------------===//
 // Compare instructions.
@@ -187,8 +193,9 @@ def MOV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx),
 }
 
 // MVC16 is not in "cskyv2 instructions reference manul"
-def MVCV16 : CSKY16Inst<AddrModeNone,
-  (outs sGPR:$rz), (ins CARRY:$ca), "mvcv16\t$rz", []> {
+def MVCV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins CARRY:$ca),
+                        "mvcv16\t$rz", []> {
+  bits<0> ca;
   bits<4> rz;
   let Inst{15,14} = 0b01;
   let Inst{13 - 10} = 0b1001;
@@ -317,11 +324,14 @@ let Constraints = "$rZ = $rz" in {
 }
 
 let Predicates = [HasBTST16] in
-  def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5),
-                        "btsti16">;
+def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5),
+                      "btsti16"> {
+  bits<0> ca;
+}
 
 def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
-                        "tst16\t$rx, $ry", []> {
+                       "tst16\t$rx, $ry", []> {
+  bits<0> ca;
   bits<4> ry;
   bits<4> rx;
   let Inst{15,14} = 0b01;
@@ -334,6 +344,7 @@ def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
 
 def TSTNBZ16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx),
                           "tstnbz16\t$rx", []> {
+  bits<0> ca;
   bits<4> rx;
   let Inst{15,14} = 0b01;
   let Inst{13 - 10} = 0b1010;
diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
index 749127f4ddc8..887e28127953 100644
--- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
+++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
@@ -36,8 +36,6 @@ class CSKYDisassembler : public MCDisassembler {
   std::unique_ptr<MCInstrInfo const> const MCII;
   mutable StringRef symbolName;
 
-  DecodeStatus handleCROperand(MCInst &Instr) const;
-
 public:
   CSKYDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                    MCInstrInfo const *MCII);
@@ -198,15 +196,9 @@ static DecodeStatus DecodemGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
   return MCDisassembler::Success;
 }
 
-// TODO
-LLVM_ATTRIBUTE_UNUSED
-static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst, uint64_t RegNo,
-                                             uint64_t Address,
+static DecodeStatus DecodeGPRSPRegisterClass(MCInst &Inst,
                                              const MCDisassembler *Decoder) {
-  if (RegNo != 14)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[RegNo]));
+  Inst.addOperand(MCOperand::createReg(CSKY::R14));
   return MCDisassembler::Success;
 }
 
@@ -224,6 +216,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, uint64_t RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCARRYRegisterClass(MCInst &Inst,
+                                             const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createReg(CSKY::C));
+  return MCDisassembler::Success;
+}
+
 template <unsigned N, unsigned S>
 static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
                                       int64_t Address,
@@ -378,121 +376,6 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
 
 #include "CSKYGenDisassemblerTables.inc"
 
-DecodeStatus CSKYDisassembler::handleCROperand(MCInst &MI) const {
-
-  // FIXME: To query instruction info from td file or a table inc file
-  switch (MI.getOpcode()) {
-  default:
-    return MCDisassembler::Success;
-  case CSKY::LD16WSP:
-  case CSKY::ST16WSP:
-  case CSKY::ADDI16ZSP:
-    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::R14));
-    return MCDisassembler::Success;
-  case CSKY::ADDI16SPSP:
-  case CSKY::SUBI16SPSP:
-    MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
-    MI.insert(MI.begin(), MCOperand::createReg(CSKY::R14));
-    return MCDisassembler::Success;
-  case CSKY::FCMPHS_S:
-  case CSKY::FCMPHS_D:
-  case CSKY::FCMPLT_S:
-  case CSKY::FCMPLT_D:
-  case CSKY::FCMPNE_S:
-  case CSKY::FCMPNE_D:
-  case CSKY::FCMPUO_S:
-  case CSKY::FCMPUO_D:
-  case CSKY::FCMPZHS_S:
-  case CSKY::FCMPZHS_D:
-  case CSKY::FCMPZLS_S:
-  case CSKY::FCMPZLS_D:
-  case CSKY::FCMPZNE_S:
-  case CSKY::FCMPZNE_D:
-  case CSKY::FCMPZUO_S:
-  case CSKY::FCMPZUO_D:
-  case CSKY::f2FCMPHS_S:
-  case CSKY::f2FCMPHS_D:
-  case CSKY::f2FCMPLT_S:
-  case CSKY::f2FCMPLT_D:
-  case CSKY::f2FCMPNE_S:
-  case CSKY::f2FCMPNE_D:
-  case CSKY::f2FCMPUO_S:
-  case CSKY::f2FCMPUO_D:
-  case CSKY::f2FCMPHSZ_S:
-  case CSKY::f2FCMPHSZ_D:
-  case CSKY::f2FCMPHZ_S:
-  case CSKY::f2FCMPHZ_D:
-  case CSKY::f2FCMPLSZ_S:
-  case CSKY::f2FCMPLSZ_D:
-  case CSKY::f2FCMPLTZ_S:
-  case CSKY::f2FCMPLTZ_D:
-  case CSKY::f2FCMPNEZ_S:
-  case CSKY::f2FCMPNEZ_D:
-  case CSKY::f2FCMPUOZ_S:
-  case CSKY::f2FCMPUOZ_D:
-
-  case CSKY::BT32:
-  case CSKY::BF32:
-  case CSKY::BT16:
-  case CSKY::BF16:
-  case CSKY::CMPNEI32:
-  case CSKY::CMPNEI16:
-  case CSKY::CMPNE32:
-  case CSKY::CMPNE16:
-  case CSKY::CMPHSI32:
-  case CSKY::CMPHSI16:
-  case CSKY::CMPHS32:
-  case CSKY::CMPHS16:
-  case CSKY::CMPLTI32:
-  case CSKY::CMPLTI16:
-  case CSKY::CMPLT32:
-  case CSKY::CMPLT16:
-  case CSKY::BTSTI32:
-  case CSKY::BTSTI16:
-  case CSKY::TSTNBZ32:
-  case CSKY::TSTNBZ16:
-  case CSKY::TST32:
-  case CSKY::TST16:
-    MI.insert(MI.begin(), MCOperand::createReg(CSKY::C));
-    return MCDisassembler::Success;
-  case CSKY::LSLC32:
-  case CSKY::LSRC32:
-  case CSKY::ASRC32:
-    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
-    return MCDisassembler::Success;
-  case CSKY::MOVF32:
-  case CSKY::MOVT32:
-  case CSKY::MVC32:
-  case CSKY::MVCV32:
-  case CSKY::MVCV16:
-  case CSKY::INCT32:
-  case CSKY::INCF32:
-  case CSKY::DECT32:
-  case CSKY::DECF32:
-  case CSKY::DECGT32:
-  case CSKY::DECLT32:
-  case CSKY::DECNE32:
-  case CSKY::CLRF32:
-  case CSKY::CLRT32:
-  case CSKY::f2FSEL_S:
-  case CSKY::f2FSEL_D:
-    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
-    return MCDisassembler::Success;
-  case CSKY::ADDC32:
-  case CSKY::ADDC16:
-  case CSKY::SUBC32:
-  case CSKY::SUBC16:
-  case CSKY::XSR32:
-    MI.insert(std::next(MI.begin()), MCOperand::createReg(CSKY::C));
-    MI.insert(MI.end(), MCOperand::createReg(CSKY::C));
-    return MCDisassembler::Success;
-  case CSKY::INS32:
-    MI.getOperand(3).setImm(MI.getOperand(3).getImm() +
-                            MI.getOperand(4).getImm());
-    return MCDisassembler::Success;
-  }
-}
-
 static bool decodeFPUV3Instruction(MCInst &MI, uint32_t insn, uint64_t Address,
                                    const MCDisassembler *DisAsm,
                                    const MCSubtargetInfo &STI) {
@@ -548,7 +431,10 @@ DecodeStatus CSKYDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Size = 2;
   }
 
-  handleCROperand(MI);
+  if (MI.getOpcode() == CSKY::INS32) {
+    MI.getOperand(3).setImm(MI.getOperand(3).getImm() +
+                            MI.getOperand(4).getImm());
+  }
 
   return Result;
 }
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index 8100f941c8d9..6c079517e22d 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_target(DirectXCodeGen
   LINK_COMPONENTS
   Analysis
   AsmPrinter
+  BinaryFormat
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index a1ef2578f00a..ca81d30473c0 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -158,12 +158,15 @@ void DXContainerGlobals::addRootSignature(Module &M,
   if (MMI.ShaderProfile == llvm::Triple::Library)
     return;
 
-  assert(MMI.EntryPropertyVec.size() == 1);
-
   auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
-  const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry;
-  const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
+  const Function *EntryFunction = nullptr;
 
+  if (MMI.ShaderProfile != llvm::Triple::RootSignature) {
+    assert(MMI.EntryPropertyVec.size() == 1);
+    EntryFunction = MMI.EntryPropertyVec[0].Entry;
+  }
+
+  const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
   if (!RS)
     return;
 
@@ -258,7 +261,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
   dxil::ModuleMetadataInfo &MMI =
       getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
   assert(MMI.EntryPropertyVec.size() == 1 ||
-         MMI.ShaderProfile == Triple::Library);
+         MMI.ShaderProfile == Triple::Library ||
+         MMI.ShaderProfile == Triple::RootSignature);
   PSV.BaseData.ShaderStage =
       static_cast<uint8_t>(MMI.ShaderProfile - Triple::Pixel);
 
@@ -279,7 +283,8 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
     break;
   }
 
-  if (MMI.ShaderProfile != Triple::Library)
+  if (MMI.ShaderProfile != Triple::Library &&
+      MMI.ShaderProfile != Triple::RootSignature)
     PSV.EntryName = MMI.EntryPropertyVec[0].Entry->getName();
 
   PSV.finalize(MMI.ShaderProfile);
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index feecfc0880e2..d507d71b99fc 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -343,9 +343,7 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 
   GOp->replaceAllUsesWith(NewGEP);
 
-  if (auto *CE = dyn_cast<ConstantExpr>(GOp))
-    CE->destroyConstant();
-  else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
+  if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp))
     OldGEPI->eraseFromParent();
 
   return true;
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index 13e3408815bb..aa16e795dc76 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -22,11 +22,13 @@ static bool finalizeLinkage(Module &M) {
 
   // Convert private globals and external globals with no usage to internal
   // linkage.
-  for (GlobalVariable &GV : M.globals())
+  for (GlobalVariable &GV : M.globals()) {
+    GV.removeDeadConstantUsers();
     if (GV.hasPrivateLinkage() || (GV.hasExternalLinkage() && GV.use_empty())) {
       GV.setLinkage(GlobalValue::InternalLinkage);
       MadeChange = true;
     }
+  }
 
   SmallVector<Function *> Funcs;
 
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index ee1db54446cb..e2469d8df957 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -51,6 +51,150 @@ static bool resourceAccessNeeds64BitExpansion(Module *M, Type *OverloadTy,
   return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
 }
 
+static Value *expand16BitIsInf(CallInst *Orig) {
+  Module *M = Orig->getModule();
+  if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+    return nullptr;
+
+  Value *Val = Orig->getOperand(0);
+  Type *ValTy = Val->getType();
+  if (!ValTy->getScalarType()->isHalfTy())
+    return nullptr;
+
+  IRBuilder<> Builder(Orig);
+  Type *IType = Type::getInt16Ty(M->getContext());
+  Constant *PosInf =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0x7c00))
+          : ConstantInt::get(IType, 0x7c00);
+
+  Constant *NegInf =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0xfc00))
+          : ConstantInt::get(IType, 0xfc00);
+
+  Value *IVal = Builder.CreateBitCast(Val, PosInf->getType());
+  Value *B1 = Builder.CreateICmpEQ(IVal, PosInf);
+  Value *B2 = Builder.CreateICmpEQ(IVal, NegInf);
+  Value *B3 = Builder.CreateOr(B1, B2);
+  return B3;
+}
+
+static Value *expand16BitIsNaN(CallInst *Orig) {
+  Module *M = Orig->getModule();
+  if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+    return nullptr;
+
+  Value *Val = Orig->getOperand(0);
+  Type *ValTy = Val->getType();
+  if (!ValTy->getScalarType()->isHalfTy())
+    return nullptr;
+
+  IRBuilder<> Builder(Orig);
+  Type *IType = Type::getInt16Ty(M->getContext());
+
+  Constant *ExpBitMask =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0x7c00))
+          : ConstantInt::get(IType, 0x7c00);
+  Constant *SigBitMask =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0x3ff))
+          : ConstantInt::get(IType, 0x3ff);
+
+  Constant *Zero =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0))
+          : ConstantInt::get(IType, 0);
+
+  Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+  Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+  Value *B1 = Builder.CreateICmpEQ(Exp, ExpBitMask);
+
+  Value *Sig = Builder.CreateAnd(IVal, SigBitMask);
+  Value *B2 = Builder.CreateICmpNE(Sig, Zero);
+  Value *B3 = Builder.CreateAnd(B1, B2);
+  return B3;
+}
+
+static Value *expand16BitIsFinite(CallInst *Orig) {
+  Module *M = Orig->getModule();
+  if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+    return nullptr;
+
+  Value *Val = Orig->getOperand(0);
+  Type *ValTy = Val->getType();
+  if (!ValTy->getScalarType()->isHalfTy())
+    return nullptr;
+
+  IRBuilder<> Builder(Orig);
+  Type *IType = Type::getInt16Ty(M->getContext());
+
+  Constant *ExpBitMask =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0x7c00))
+          : ConstantInt::get(IType, 0x7c00);
+
+  Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+  Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+  Value *B1 = Builder.CreateICmpNE(Exp, ExpBitMask);
+  return B1;
+}
+
+static Value *expand16BitIsNormal(CallInst *Orig) {
+  Module *M = Orig->getModule();
+  if (M->getTargetTriple().getDXILVersion() >= VersionTuple(1, 9))
+    return nullptr;
+
+  Value *Val = Orig->getOperand(0);
+  Type *ValTy = Val->getType();
+  if (!ValTy->getScalarType()->isHalfTy())
+    return nullptr;
+
+  IRBuilder<> Builder(Orig);
+  Type *IType = Type::getInt16Ty(M->getContext());
+
+  Constant *ExpBitMask =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0x7c00))
+          : ConstantInt::get(IType, 0x7c00);
+  Constant *Zero =
+      ValTy->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    cast<FixedVectorType>(ValTy)->getNumElements()),
+                ConstantInt::get(IType, 0))
+          : ConstantInt::get(IType, 0);
+
+  Value *IVal = Builder.CreateBitCast(Val, ExpBitMask->getType());
+  Value *Exp = Builder.CreateAnd(IVal, ExpBitMask);
+  Value *NotAllZeroes = Builder.CreateICmpNE(Exp, Zero);
+  Value *NotAllOnes = Builder.CreateICmpNE(Exp, ExpBitMask);
+  Value *B1 = Builder.CreateAnd(NotAllZeroes, NotAllOnes);
+  return B1;
+}
+
 static bool isIntrinsicExpansion(Function &F) {
   switch (F.getIntrinsicID()) {
   case Intrinsic::abs:
@@ -68,6 +212,7 @@ static bool isIntrinsicExpansion(Function &F) {
   case Intrinsic::dx_sclamp:
   case Intrinsic::dx_nclamp:
   case Intrinsic::dx_degrees:
+  case Intrinsic::dx_isinf:
   case Intrinsic::dx_lerp:
   case Intrinsic::dx_normalize:
   case Intrinsic::dx_fdot:
@@ -301,13 +446,16 @@ static Value *expandIsFPClass(CallInst *Orig) {
   auto *TCI = dyn_cast<ConstantInt>(T);
 
   // These FPClassTest cases have DXIL opcodes, so they will be handled in
-  // DXIL Op Lowering instead.
+  // DXIL Op Lowering instead for all non f16 cases.
   switch (TCI->getZExtValue()) {
   case FPClassTest::fcInf:
+    return expand16BitIsInf(Orig);
   case FPClassTest::fcNan:
+    return expand16BitIsNaN(Orig);
   case FPClassTest::fcNormal:
+    return expand16BitIsNormal(Orig);
   case FPClassTest::fcFinite:
-    return nullptr;
+    return expand16BitIsFinite(Orig);
   }
 
   IRBuilder<> Builder(Orig);
@@ -873,6 +1021,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
   case Intrinsic::dx_degrees:
     Result = expandDegreesIntrinsic(Orig);
     break;
+  case Intrinsic::dx_isinf:
+    Result = expand16BitIsInf(Orig);
+    break;
   case Intrinsic::dx_lerp:
     Result = expandLerpIntrinsic(Orig);
     break;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index bd421771e8ed..577b4624458b 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -220,7 +220,7 @@ public:
 
     removeResourceGlobals(CI);
 
-    auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(5));
+    auto *NameGlobal = dyn_cast<llvm::GlobalVariable>(CI->getArgOperand(4));
 
     CI->replaceAllUsesWith(Replacement);
     CI->eraseFromParent();
@@ -233,6 +233,7 @@ public:
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int8Ty = IRB.getInt8Ty();
     Type *Int32Ty = IRB.getInt32Ty();
+    Type *Int1Ty = IRB.getInt1Ty();
 
     return replaceFunction(F, [&](CallInst *CI) -> Error {
       IRB.SetInsertPoint(CI);
@@ -249,10 +250,13 @@ public:
         IndexOp = IRB.CreateAdd(IndexOp,
                                 ConstantInt::get(Int32Ty, Binding.LowerBound));
 
+      // FIXME: The last argument is a NonUniform flag which needs to be set
+      // based on resource analysis.
+      // https://github.com/llvm/llvm-project/issues/155701
       std::array<Value *, 4> Args{
           ConstantInt::get(Int8Ty, llvm::to_underlying(RC)),
           ConstantInt::get(Int32Ty, Binding.RecordID), IndexOp,
-          CI->getArgOperand(4)};
+          ConstantInt::get(Int1Ty, false)};
       Expected<CallInst *> OpCall =
           OpBuilder.tryCreateOp(OpCode::CreateHandle, Args, CI->getName());
       if (Error E = OpCall.takeError())
@@ -267,6 +271,7 @@ public:
   [[nodiscard]] bool lowerToBindAndAnnotateHandle(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int32Ty = IRB.getInt32Ty();
+    Type *Int1Ty = IRB.getInt1Ty();
 
     return replaceFunction(F, [&](CallInst *CI) -> Error {
       IRB.SetInsertPoint(CI);
@@ -295,7 +300,11 @@ public:
                                 : Binding.LowerBound + Binding.Size - 1;
       Constant *ResBind = OpBuilder.getResBind(Binding.LowerBound, UpperBound,
                                                Binding.Space, RC);
-      std::array<Value *, 3> BindArgs{ResBind, IndexOp, CI->getArgOperand(4)};
+      // FIXME: The last argument is a NonUniform flag which needs to be set
+      // based on resource analysis.
+      // https://github.com/llvm/llvm-project/issues/155701
+      Constant *NonUniform = ConstantInt::get(Int1Ty, false);
+      std::array<Value *, 3> BindArgs{ResBind, IndexOp, NonUniform};
       Expected<CallInst *> OpBind = OpBuilder.tryCreateOp(
           OpCode::CreateHandleFromBinding, BindArgs, CI->getName());
       if (Error E = OpBind.takeError())
diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
index be2c7d1ddff3..d02f4b9f7ebc 100644
--- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
+++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
@@ -25,21 +25,6 @@
 using namespace llvm;
 using namespace llvm::dxil;
 
-static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) {
-  using namespace dxbc;
-  switch (RangeType) {
-  case DescriptorRangeType::SRV:
-    return ResourceClass::SRV;
-  case DescriptorRangeType::UAV:
-    return ResourceClass::UAV;
-  case DescriptorRangeType::CBV:
-    return ResourceClass::CBuffer;
-  case DescriptorRangeType::Sampler:
-    return ResourceClass::Sampler;
-  }
-  llvm_unreachable("Unknown DescriptorRangeType");
-}
-
 static ResourceClass toResourceClass(dxbc::RootParameterType Type) {
   using namespace dxbc;
   switch (Type) {
@@ -95,7 +80,7 @@ static void reportOverlappingError(Module &M, ResourceInfo R1,
 }
 
 static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
-  bool ErrorFound = false;
+  [[maybe_unused]] bool ErrorFound = false;
   for (const auto &ResList :
        {DRM.srvs(), DRM.uavs(), DRM.cbuffers(), DRM.samplers()}) {
     if (ResList.empty())
@@ -118,10 +103,8 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
                        "true, yet no overlapping binding was found");
 }
 
-static void
-reportOverlappingRegisters(Module &M,
-                           const llvm::hlsl::BindingInfoBuilder::Binding &R1,
-                           const llvm::hlsl::BindingInfoBuilder::Binding &R2) {
+static void reportOverlappingRegisters(Module &M, const llvm::hlsl::Binding &R1,
+                                       const llvm::hlsl::Binding &R2) {
   SmallString<128> Message;
 
   raw_svector_ostream OS(Message);
@@ -133,6 +116,17 @@ reportOverlappingRegisters(Module &M,
   M.getContext().diagnose(DiagnosticInfoGeneric(Message));
 }
 
+static void
+reportRegNotBound(Module &M, ResourceClass Class,
+                  const llvm::dxil::ResourceInfo::ResourceBinding &Unbound) {
+  SmallString<128> Message;
+  raw_svector_ostream OS(Message);
+  OS << getResourceClassName(Class) << " register " << Unbound.LowerBound
+     << " in space " << Unbound.Space
+     << " does not have a binding in the Root Signature";
+  M.getContext().diagnose(DiagnosticInfoGeneric(Message));
+}
+
 static dxbc::ShaderVisibility
 tripleToVisibility(llvm::Triple::EnvironmentType ET) {
   switch (ET) {
@@ -157,22 +151,23 @@ tripleToVisibility(llvm::Triple::EnvironmentType ET) {
 
 static void validateRootSignature(Module &M,
                                   const mcdxbc::RootSignatureDesc &RSD,
-                                  dxil::ModuleMetadataInfo &MMI) {
+                                  dxil::ModuleMetadataInfo &MMI,
+                                  DXILResourceMap &DRM,
+                                  DXILResourceTypeMap &DRTM) {
 
   hlsl::BindingInfoBuilder Builder;
   dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile);
 
   for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) {
     dxbc::ShaderVisibility ParamVisibility =
-        static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility);
+        dxbc::ShaderVisibility(ParamInfo.Visibility);
     if (ParamVisibility != dxbc::ShaderVisibility::All &&
         ParamVisibility != Visibility)
       continue;
-    dxbc::RootParameterType ParamType =
-        static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType);
+    dxbc::RootParameterType ParamType = dxbc::RootParameterType(ParamInfo.Type);
     switch (ParamType) {
     case dxbc::RootParameterType::Constants32Bit: {
-      dxbc::RTS0::v1::RootConstants Const =
+      mcdxbc::RootConstants Const =
           RSD.ParametersContainer.getConstant(ParamInfo.Location);
       Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace,
                            Const.ShaderRegister, Const.ShaderRegister,
@@ -183,12 +178,11 @@ static void validateRootSignature(Module &M,
     case dxbc::RootParameterType::SRV:
     case dxbc::RootParameterType::UAV:
     case dxbc::RootParameterType::CBV: {
-      dxbc::RTS0::v2::RootDescriptor Desc =
+      mcdxbc::RootDescriptor Desc =
           RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location);
-      Builder.trackBinding(toResourceClass(static_cast<dxbc::RootParameterType>(
-                               ParamInfo.Header.ParameterType)),
-                           Desc.RegisterSpace, Desc.ShaderRegister,
-                           Desc.ShaderRegister, &ParamInfo);
+      Builder.trackBinding(toResourceClass(ParamInfo.Type), Desc.RegisterSpace,
+                           Desc.ShaderRegister, Desc.ShaderRegister,
+                           &ParamInfo);
 
       break;
     }
@@ -196,16 +190,13 @@ static void validateRootSignature(Module &M,
       const mcdxbc::DescriptorTable &Table =
           RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location);
 
-      for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) {
+      for (const mcdxbc::DescriptorRange &Range : Table.Ranges) {
         uint32_t UpperBound =
             Range.NumDescriptors == ~0U
                 ? Range.BaseShaderRegister
                 : Range.BaseShaderRegister + Range.NumDescriptors - 1;
-        Builder.trackBinding(
-            toResourceClass(
-                static_cast<dxbc::DescriptorRangeType>(Range.RangeType)),
-            Range.RegisterSpace, Range.BaseShaderRegister, UpperBound,
-            &ParamInfo);
+        Builder.trackBinding(Range.RangeType, Range.RegisterSpace,
+                             Range.BaseShaderRegister, UpperBound, &ParamInfo);
       }
       break;
     }
@@ -218,11 +209,19 @@ static void validateRootSignature(Module &M,
 
   Builder.calculateBindingInfo(
       [&M](const llvm::hlsl::BindingInfoBuilder &Builder,
-           const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) {
-        const llvm::hlsl::BindingInfoBuilder::Binding &Overlaping =
+           const llvm::hlsl::Binding &ReportedBinding) {
+        const llvm::hlsl::Binding &Overlaping =
             Builder.findOverlapping(ReportedBinding);
         reportOverlappingRegisters(M, ReportedBinding, Overlaping);
       });
+  const hlsl::BoundRegs &BoundRegs = Builder.takeBoundRegs();
+  for (const ResourceInfo &RI : DRM) {
+    const ResourceInfo::ResourceBinding &Binding = RI.getBinding();
+    ResourceClass RC = DRTM[RI.getHandleTy()].getResourceClass();
+    if (!BoundRegs.isBound(RC, Binding.Space, Binding.LowerBound,
+                           Binding.LowerBound + Binding.Size - 1))
+      reportRegNotBound(M, RC, Binding);
+  }
 }
 
 static mcdxbc::RootSignatureDesc *
@@ -236,7 +235,8 @@ getRootSignature(RootSignatureBindingInfo &RSBI,
 static void reportErrors(Module &M, DXILResourceMap &DRM,
                          DXILResourceBindingInfo &DRBI,
                          RootSignatureBindingInfo &RSBI,
-                         dxil::ModuleMetadataInfo &MMI) {
+                         dxil::ModuleMetadataInfo &MMI,
+                         DXILResourceTypeMap &DRTM) {
   if (DRM.hasInvalidCounterDirection())
     reportInvalidDirection(M, DRM);
 
@@ -247,7 +247,7 @@ static void reportErrors(Module &M, DXILResourceMap &DRM,
                                        "DXILResourceImplicitBinding pass");
 
   if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI))
-    validateRootSignature(M, *RSD, MMI);
+    validateRootSignature(M, *RSD, MMI, DRM, DRTM);
 }
 
 PreservedAnalyses
@@ -256,8 +256,9 @@ DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) {
   DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M);
   RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M);
   ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M);
+  DXILResourceTypeMap &DRTM = MAM.getResult<DXILResourceTypeAnalysis>(M);
 
-  reportErrors(M, DRM, DRBI, RSBI, MMI);
+  reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM);
   return PreservedAnalyses::all();
 }
 
@@ -273,8 +274,10 @@ public:
         getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
     dxil::ModuleMetadataInfo &MMI =
         getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
+    DXILResourceTypeMap &DRTM =
+        getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
 
-    reportErrors(M, DRM, DRBI, RSBI, MMI);
+    reportErrors(M, DRM, DRBI, RSBI, MMI, DRTM);
     return false;
   }
   StringRef getPassName() const override {
@@ -288,6 +291,7 @@ public:
     AU.addRequired<DXILResourceBindingWrapperPass>();
     AU.addRequired<DXILMetadataAnalysisWrapperPass>();
     AU.addRequired<RootSignatureAnalysisWrapper>();
+    AU.addRequired<DXILResourceTypeWrapperPass>();
     AU.addPreserved<DXILResourceWrapperPass>();
     AU.addPreserved<DXILResourceBindingWrapperPass>();
     AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
@@ -305,6 +309,7 @@ INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
 INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
                     "DXIL Post Optimization Validation", false, false)
 
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index c33ec0efd73c..6579d3405cf3 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -8,14 +8,19 @@
 
 #include "DXILResourceAccess.h"
 #include "DirectX.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DXILResource.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/User.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 #define DEBUG_TYPE "dxil-resource-access"
 
@@ -198,6 +203,112 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
   llvm_unreachable("Unhandled case in switch");
 }
 
+/// Collect the instructions in \p Start's basic block that are connected to
+/// \p Start through use-def edges, and return them in program order.
+///
+/// The walk is seeded with the users of \p Start. From every instruction it
+/// reaches it follows both the users and the operands, skipping anything that
+/// is not an instruction of the same block. The operand edge that leads back
+/// to \p Start itself is never followed.
+/// \returns the connected instructions, each listed once, in block order.
+static SmallVector<Instruction *> collectBlockUseDef(Instruction *Start) {
+  BasicBlock *Parent = Start->getParent();
+  SmallPtrSet<Instruction *, 32> Reached;
+  SmallVector<Instruction *, 32> Pending;
+
+  // Queue \p V if it is an instruction that lives in the block being walked.
+  auto EnqueueIfLocal = [&](Value *V) {
+    auto *Inst = dyn_cast<Instruction>(V);
+    if (Inst && Inst->getParent() == Parent)
+      Pending.push_back(Inst);
+  };
+
+  // The direct users of the starting instruction seed the walk.
+  for (User *Usr : Start->users())
+    EnqueueIfLocal(Usr);
+
+  // Drain the worklist (last in, first out). The visit order is irrelevant
+  // because the result is put into program order below.
+  while (!Pending.empty()) {
+    Instruction *Cur = Pending.pop_back_val();
+    if (!Reached.insert(Cur).second)
+      continue; // Already expanded.
+
+    for (User *Usr : Cur->users())
+      EnqueueIfLocal(Usr);
+    for (Use &Op : Cur->operands())
+      if (Op != Start)
+        EnqueueIfLocal(Op);
+  }
+
+  // Everything reached lives in Parent, so scanning the block from front to
+  // back and keeping the reached instructions yields program order.
+  SmallVector<Instruction *> Ordered;
+  for (Instruction &Inst : *Parent)
+    if (Reached.count(&Inst))
+      Ordered.push_back(&Inst);
+
+  return Ordered;
+}
+
+/// Clone the instructions in \p UsesInBlock into \p BB, inserting them before
+/// that block's last instruction. \p Phi and any PHI in the list are not
+/// cloned; the clones use the value each PHI receives from \p BB instead.
+static void phiNodeRemapHelper(PHINode *Phi, BasicBlock *BB,
+                               IRBuilder<> &Builder,
+                               SmallVector<Instruction *> &UsesInBlock) {
+  const RemapFlags Flags = RF_NoModuleLevelChanges | RF_IgnoreMissingLocals;
+  ValueToValueMapTy Remap;
+  Remap[Phi] = Phi->getIncomingValueForBlock(BB);
+  Builder.SetInsertPoint(&BB->back());
+  for (Instruction *Inst : UsesInBlock) {
+    if (auto *NestedPhi = dyn_cast<PHINode>(Inst)) {
+      Remap[NestedPhi] = NestedPhi->getIncomingValueForBlock(BB);
+    } else {
+      Instruction *Copy = Inst->clone();
+      RemapInstruction(Copy, Remap, Flags);
+      Builder.Insert(Copy);
+      Remap[Inst] = Copy;
+    }
+  }
+}
+
+/// Remove each PHI user of \p II by cloning the instructions that
+/// collectBlockUseDef() reports for the PHI into every incoming block. If the
+/// last of them is a return, each incoming block's old last instruction is
+/// queued in \p PrevBBDeadInsts and the PHI's block in \p DeadBB.
+static void phiNodeReplacement(IntrinsicInst *II,
+                               SmallVectorImpl<Instruction *> &PrevBBDeadInsts,
+                               SetVector<BasicBlock *> &DeadBB) {
+  SmallVector<Instruction *> CurrBBDeadInsts;
+  for (User *U : II->users()) {
+    auto *Phi = dyn_cast<PHINode>(U);
+    if (!Phi)
+      continue;
+
+    IRBuilder<> Builder(Phi);
+    SmallVector<Instruction *> UsesInBlock = collectBlockUseDef(Phi);
+    // A PHI with no users in its block gives an empty list; guard back().
+    bool HasReturnUse =
+        !UsesInBlock.empty() && isa<ReturnInst>(UsesInBlock.back());
+
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
+      auto *CurrIncomingBB = Phi->getIncomingBlock(I);
+      phiNodeRemapHelper(Phi, CurrIncomingBB, Builder, UsesInBlock);
+      if (HasReturnUse)
+        PrevBBDeadInsts.push_back(&CurrIncomingBB->back());
+    }
+
+    CurrBBDeadInsts.push_back(Phi);
+    CurrBBDeadInsts.append(UsesInBlock.begin(), UsesInBlock.end());
+    if (HasReturnUse)
+      DeadBB.insert(Phi->getParent());
+  }
+  // Erase back to front so later instructions go before the ones they use.
+  for (Instruction *Dead : llvm::reverse(CurrBBDeadInsts))
+    Dead->eraseFromParent();
+}
+
 static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
   // Process users keeping track of indexing accumulated from GEPs.
   struct AccessAndOffset {
@@ -229,7 +340,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
     } else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) {
       createLoadIntrinsic(II, LI, Current.Offset, RTI);
       DeadInsts.push_back(LI);
-
     } else
       llvm_unreachable("Unhandled instruction - pointer escaped?");
   }
@@ -242,13 +352,27 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) {
 
 static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) {
   SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources;
-  for (BasicBlock &BB : F)
+  SetVector<BasicBlock *> DeadBB;
+  SmallVector<Instruction *> PrevBBDeadInsts;
+  for (BasicBlock &BB : make_early_inc_range(F)) {
+    for (Instruction &I : make_early_inc_range(BB))
+      if (auto *II = dyn_cast<IntrinsicInst>(&I))
+        if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer)
+          phiNodeReplacement(II, PrevBBDeadInsts, DeadBB);
+
     for (Instruction &I : BB)
       if (auto *II = dyn_cast<IntrinsicInst>(&I))
         if (II->getIntrinsicID() == Intrinsic::dx_resource_getpointer) {
           auto *HandleTy = cast<TargetExtType>(II->getArgOperand(0)->getType());
           Resources.emplace_back(II, DRTM[HandleTy]);
         }
+  }
+  for (auto *Dead : PrevBBDeadInsts)
+    Dead->eraseFromParent();
+  PrevBBDeadInsts.clear();
+  for (auto *Dead : DeadBB)
+    Dead->eraseFromParent();
+  DeadBB.clear();
 
   for (auto &[II, RI] : Resources)
     replaceAccess(II, RI);
@@ -279,7 +403,6 @@ public:
   bool runOnFunction(Function &F) override {
     DXILResourceTypeMap &DRTM =
         getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap();
-
     return transformResourcePointers(F, DRTM);
   }
   StringRef getPassName() const override { return "DXIL Resource Access"; }
diff --git a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
index 6e69c5ac1d63..b0d9ad8da10e 100644
--- a/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceImplicitBinding.cpp
@@ -111,8 +111,7 @@ static bool assignBindings(Module &M, DXILResourceBindingInfo &DRBI,
          RegSlotOp,                /* register slot */
          IB.Call->getOperand(2),   /* size */
          IB.Call->getOperand(3),   /* index */
-         IB.Call->getOperand(4),   /* non-uniform flag */
-         IB.Call->getOperand(5)}); /* name */
+         IB.Call->getOperand(4)}); /* name */
     IB.Call->replaceAllUsesWith(NewCall);
     IB.Call->eraseFromParent();
     Changed = true;
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index a4f5086c2f42..ac3c7dde6b89 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -24,9 +24,11 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/DXContainerRootSignature.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 
@@ -70,6 +72,13 @@ analyzeModule(Module &M) {
   if (RootSignatureNode == nullptr)
     return RSDMap;
 
+  bool AllowNullFunctions = false;
+  if (M.getTargetTriple().getEnvironment() ==
+      Triple::EnvironmentType::RootSignature) {
+    assert(RootSignatureNode->getNumOperands() == 1);
+    AllowNullFunctions = true;
+  }
+
   for (const auto &RSDefNode : RootSignatureNode->operands()) {
     if (RSDefNode->getNumOperands() != 3) {
       reportError(Ctx, "Invalid Root Signature metadata - expected function, "
@@ -78,24 +87,28 @@ analyzeModule(Module &M) {
     }
 
     // Function was pruned during compilation.
-    const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0);
-    if (FunctionPointerMdNode == nullptr) {
-      reportError(
-          Ctx, "Function associated with Root Signature definition is null.");
-      continue;
-    }
+    Function *F = nullptr;
+
+    if (!AllowNullFunctions) {
+      const MDOperand &FunctionPointerMdNode = RSDefNode->getOperand(0);
+      if (FunctionPointerMdNode == nullptr) {
+        reportError(
+            Ctx, "Function associated with Root Signature definition is null.");
+        continue;
+      }
 
-    ValueAsMetadata *VAM =
-        llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get());
-    if (VAM == nullptr) {
-      reportError(Ctx, "First element of root signature is not a Value");
-      continue;
-    }
+      ValueAsMetadata *VAM =
+          llvm::dyn_cast<ValueAsMetadata>(FunctionPointerMdNode.get());
+      if (VAM == nullptr) {
+        reportError(Ctx, "First element of root signature is not a Value");
+        continue;
+      }
 
-    Function *F = dyn_cast<Function>(VAM->getValue());
-    if (F == nullptr) {
-      reportError(Ctx, "First element of root signature is not a Function");
-      continue;
+      F = dyn_cast<Function>(VAM->getValue());
+      if (F == nullptr) {
+        reportError(Ctx, "First element of root signature is not a Function");
+        continue;
+      }
     }
 
     Metadata *RootElementListOperand = RSDefNode->getOperand(1).get();
@@ -171,41 +184,41 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M,
        << "RootParametersOffset: " << RS.RootParameterOffset << "\n"
        << "NumParameters: " << RS.ParametersContainer.size() << "\n";
     for (size_t I = 0; I < RS.ParametersContainer.size(); I++) {
-      const auto &[Type, Loc] =
-          RS.ParametersContainer.getTypeAndLocForParameter(I);
-      const dxbc::RTS0::v1::RootParameterHeader Header =
-          RS.ParametersContainer.getHeader(I);
-
-      OS << "- Parameter Type: " << Type << "\n"
-         << "  Shader Visibility: " << Header.ShaderVisibility << "\n";
-
-      switch (Type) {
-      case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): {
-        const dxbc::RTS0::v1::RootConstants &Constants =
-            RS.ParametersContainer.getConstant(Loc);
+      const mcdxbc::RootParameterInfo &Info = RS.ParametersContainer.getInfo(I);
+
+      OS << "- Parameter Type: "
+         << enumToStringRef(Info.Type, dxbc::getRootParameterTypes()) << "\n"
+         << "  Shader Visibility: "
+         << enumToStringRef(Info.Visibility, dxbc::getShaderVisibility())
+         << "\n";
+      switch (Info.Type) {
+      case dxbc::RootParameterType::Constants32Bit: {
+        const mcdxbc::RootConstants &Constants =
+            RS.ParametersContainer.getConstant(Info.Location);
         OS << "  Register Space: " << Constants.RegisterSpace << "\n"
            << "  Shader Register: " << Constants.ShaderRegister << "\n"
            << "  Num 32 Bit Values: " << Constants.Num32BitValues << "\n";
         break;
       }
-      case llvm::to_underlying(dxbc::RootParameterType::CBV):
-      case llvm::to_underlying(dxbc::RootParameterType::UAV):
-      case llvm::to_underlying(dxbc::RootParameterType::SRV): {
-        const dxbc::RTS0::v2::RootDescriptor &Descriptor =
-            RS.ParametersContainer.getRootDescriptor(Loc);
+      case dxbc::RootParameterType::CBV:
+      case dxbc::RootParameterType::UAV:
+      case dxbc::RootParameterType::SRV: {
+        const mcdxbc::RootDescriptor &Descriptor =
+            RS.ParametersContainer.getRootDescriptor(Info.Location);
         OS << "  Register Space: " << Descriptor.RegisterSpace << "\n"
            << "  Shader Register: " << Descriptor.ShaderRegister << "\n";
         if (RS.Version > 1)
           OS << "  Flags: " << Descriptor.Flags << "\n";
         break;
       }
-      case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+      case dxbc::RootParameterType::DescriptorTable: {
         const mcdxbc::DescriptorTable &Table =
-            RS.ParametersContainer.getDescriptorTable(Loc);
+            RS.ParametersContainer.getDescriptorTable(Info.Location);
         OS << "  NumRanges: " << Table.Ranges.size() << "\n";
 
-        for (const dxbc::RTS0::v2::DescriptorRange Range : Table) {
-          OS << "  - Range Type: " << Range.RangeType << "\n"
+        for (const mcdxbc::DescriptorRange &Range : Table) {
+          OS << "  - Range Type: "
+             << dxil::getResourceClassName(Range.RangeType) << "\n"
              << "    Register Space: " << Range.RegisterSpace << "\n"
              << "    Base Shader Register: " << Range.BaseShaderRegister << "\n"
              << "    Num Descriptors: " << Range.NumDescriptors << "\n"
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 82bcacee7a6d..9eebcc9b1306 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -127,6 +127,8 @@ static StringRef getShortShaderStage(Triple::EnvironmentType Env) {
     return "ms";
   case Triple::Amplification:
     return "as";
+  case Triple::RootSignature:
+    return "rootsig";
   default:
     break;
   }
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 1d79c3018439..bc1a3a7995bd 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -2113,7 +2113,7 @@ void DXILBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
         }
         break;
       case Instruction::GetElementPtr: {
-        Code = bitc::CST_CODE_CE_GEP;
+        Code = bitc::CST_CODE_CE_GEP_OLD;
         const auto *GO = cast<GEPOperator>(C);
         if (GO->isInBounds())
           Code = bitc::CST_CODE_CE_INBOUNDS_GEP;
diff --git a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
index f99bb4f4eaee..c2e139edc6bd 100644
--- a/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
+++ b/llvm/lib/Target/DirectX/DirectXIRPasses/PointerTypeAnalysis.cpp
@@ -15,25 +15,39 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
 
 using namespace llvm;
 using namespace llvm::dxil;
 
 namespace {
 
+Type *classifyFunctionType(const Function &F, PointerTypeMap &Map);
+
 // Classifies the type of the value passed in by walking the value's users to
 // find a typed instruction to materialize a type from.
 Type *classifyPointerType(const Value *V, PointerTypeMap &Map) {
   assert(V->getType()->isPointerTy() &&
          "classifyPointerType called with non-pointer");
+
+  // A CallInst will trigger this case, and we want to classify its Function
+  // operand as a Function rather than a generic Value.
+  if (const Function *F = dyn_cast<Function>(V))
+    return classifyFunctionType(*F, Map);
+
+  // There can potentially be dead constants hanging off of the globals we do
+  // not want to deal with. So we remove them here.
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    GV->removeDeadConstantUsers();
+
   auto It = Map.find(V);
   if (It != Map.end())
     return It->second;
 
   Type *PointeeTy = nullptr;
-  if (auto *Inst = dyn_cast<GetElementPtrInst>(V)) {
-    if (!Inst->getResultElementType()->isPointerTy())
-      PointeeTy = Inst->getResultElementType();
+  if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+    if (!GEP->getResultElementType()->isPointerTy())
+      PointeeTy = GEP->getResultElementType();
   } else if (auto *Inst = dyn_cast<AllocaInst>(V)) {
     PointeeTy = Inst->getAllocatedType();
   } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
@@ -49,8 +63,8 @@ Type *classifyPointerType(const Value *V, PointerTypeMap &Map) {
       // When store value is ptr type, cannot get more type info.
       if (NewPointeeTy->isPointerTy())
         continue;
-    } else if (const auto *Inst = dyn_cast<GetElementPtrInst>(User)) {
-      NewPointeeTy = Inst->getSourceElementType();
+    } else if (const auto *GEP = dyn_cast<GEPOperator>(User)) {
+      NewPointeeTy = GEP->getSourceElementType();
     }
     if (NewPointeeTy) {
       // HLSL doesn't support pointers, so it is unlikely to get more than one
@@ -204,6 +218,9 @@ PointerTypeMap PointerTypeAnalysis::run(const Module &M) {
       for (const auto &I : B) {
         if (I.getType()->isPointerTy())
           classifyPointerType(&I, Map);
+        for (const auto &O : I.operands())
+          if (O.get()->getType()->isPointerTy())
+            classifyPointerType(O.get(), Map);
       }
     }
   }
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
index 07b68648f16c..bb2efa43d818 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
@@ -11,10 +11,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "DirectXInstrInfo.h"
+#include "DirectXSubtarget.h"
 
 #define GET_INSTRINFO_CTOR_DTOR
 #include "DirectXGenInstrInfo.inc"
 
 using namespace llvm;
 
+DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI)
+    : DirectXGenInstrInfo(STI) {} // Only forwards STI to the generated base.
+
 DirectXInstrInfo::~DirectXInstrInfo() {}
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.h b/llvm/lib/Target/DirectX/DirectXInstrInfo.h
index e2c7036fc74a..57ede28030b2 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.h
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.h
@@ -20,9 +20,11 @@
 #include "DirectXGenInstrInfo.inc"
 
 namespace llvm {
+class DirectXSubtarget;
+
 struct DirectXInstrInfo : public DirectXGenInstrInfo {
   const DirectXRegisterInfo RI;
-  explicit DirectXInstrInfo() : DirectXGenInstrInfo() {}
+  explicit DirectXInstrInfo(const DirectXSubtarget &STI);
   const DirectXRegisterInfo &getRegisterInfo() const { return RI; }
   ~DirectXInstrInfo() override;
 };
diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
index 526b7d29fb13..f8519177cc2d 100644
--- a/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
+++ b/llvm/lib/Target/DirectX/DirectXSubtarget.cpp
@@ -24,6 +24,7 @@ using namespace llvm;
 
 DirectXSubtarget::DirectXSubtarget(const Triple &TT, StringRef CPU,
                                    StringRef FS, const DirectXTargetMachine &TM)
-    : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), FL(*this), TL(TM, *this) {}
+    : DirectXGenSubtargetInfo(TT, CPU, CPU, FS), InstrInfo(*this), FL(*this),
+      TL(TM, *this) {}
 
 void DirectXSubtarget::anchor() {}
diff --git a/llvm/lib/Target/DirectX/DirectXSubtarget.h b/llvm/lib/Target/DirectX/DirectXSubtarget.h
index b2374caaf3cd..f3d71c4c4e3b 100644
--- a/llvm/lib/Target/DirectX/DirectXSubtarget.h
+++ b/llvm/lib/Target/DirectX/DirectXSubtarget.h
@@ -28,9 +28,9 @@ namespace llvm {
 class DirectXTargetMachine;
 
 class DirectXSubtarget : public DirectXGenSubtargetInfo {
+  DirectXInstrInfo InstrInfo;
   DirectXFrameLowering FL;
   DirectXTargetLowering TL;
-  DirectXInstrInfo InstrInfo;
 
   virtual void anchor(); // virtual anchor method
 
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index de10092cbe3c..0639878c1256 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -173,6 +173,19 @@ static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
                                     const MCDisassembler *Decoder);
 static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                     const MCDisassembler *Decoder);
+
+static DecodeStatus n1ConstDecoder(MCInst &MI, const MCDisassembler *Decoder) {
+  const MCExpr *MinusOne = MCConstantExpr::create(-1, Decoder->getContext());
+  MI.addOperand(MCOperand::createExpr(MinusOne)); // Operand is always -1.
+  return DecodeStatus::Success;
+}
+
+static DecodeStatus sgp10ConstDecoder(MCInst &MI,
+                                      const MCDisassembler *Decoder) {
+  MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); // Always SGP1_0.
+  return DecodeStatus::Success; // Decoder is unused; nothing here can fail.
+}
+
 #include "HexagonDepDecoders.inc"
 #include "HexagonGenDisassemblerTables.inc"
 
@@ -349,21 +362,6 @@ void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
   }
 }
 
-static void adjustDuplex(MCInst &MI, MCContext &Context) {
-  switch (MI.getOpcode()) {
-  case Hexagon::SA1_setin1:
-    MI.insert(MI.begin() + 1,
-              MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
-    break;
-  case Hexagon::SA1_dec:
-    MI.insert(MI.begin() + 2,
-              MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
-    break;
-  default:
-    break;
-  }
-}
-
 DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
                                                        ArrayRef<uint8_t> Bytes,
                                                        uint64_t Address,
@@ -468,12 +466,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
     CurrentExtender = TmpExtender;
     if (Result != DecodeStatus::Success)
       return DecodeStatus::Fail;
-    adjustDuplex(*MILow, getContext());
     Result = decodeInstruction(
         DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI);
     if (Result != DecodeStatus::Success)
       return DecodeStatus::Fail;
-    adjustDuplex(*MIHigh, getContext());
     MCOperand OPLow = MCOperand::createInst(MILow);
     MCOperand OPHigh = MCOperand::createInst(MIHigh);
     MI.addOperand(OPLow);
@@ -499,41 +495,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
 
   }
 
-  switch (MI.getOpcode()) {
-  case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
-  case Hexagon::J4_cmpeqn1_f_jumpnv_t:
-  case Hexagon::J4_cmpeqn1_fp0_jump_nt:
-  case Hexagon::J4_cmpeqn1_fp0_jump_t:
-  case Hexagon::J4_cmpeqn1_fp1_jump_nt:
-  case Hexagon::J4_cmpeqn1_fp1_jump_t:
-  case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
-  case Hexagon::J4_cmpeqn1_t_jumpnv_t:
-  case Hexagon::J4_cmpeqn1_tp0_jump_nt:
-  case Hexagon::J4_cmpeqn1_tp0_jump_t:
-  case Hexagon::J4_cmpeqn1_tp1_jump_nt:
-  case Hexagon::J4_cmpeqn1_tp1_jump_t:
-  case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
-  case Hexagon::J4_cmpgtn1_f_jumpnv_t:
-  case Hexagon::J4_cmpgtn1_fp0_jump_nt:
-  case Hexagon::J4_cmpgtn1_fp0_jump_t:
-  case Hexagon::J4_cmpgtn1_fp1_jump_nt:
-  case Hexagon::J4_cmpgtn1_fp1_jump_t:
-  case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
-  case Hexagon::J4_cmpgtn1_t_jumpnv_t:
-  case Hexagon::J4_cmpgtn1_tp0_jump_nt:
-  case Hexagon::J4_cmpgtn1_tp0_jump_t:
-  case Hexagon::J4_cmpgtn1_tp1_jump_nt:
-  case Hexagon::J4_cmpgtn1_tp1_jump_t:
-    MI.insert(MI.begin() + 1,
-              MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
-    break;
-  case Hexagon::Y4_crswap10:
-    MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0));
-    break;
-  default:
-    break;
-  }
-
   if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) {
     unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI);
     MCOperand &MCO = MI.getOperand(OpIndex);
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 0dbe743d13ed..6d0529fb4277 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -176,8 +176,11 @@ def UseSmallData       : Predicate<"HST->useSmallData()">;
 def UseCabac           : Predicate<"HST->useCabac()">,
                          AssemblerPredicate<(any_of FeatureCabac)>;
 
-def Hvx64:  HwMode<"+hvx-length64b", [UseHVX64B]>;
-def Hvx128: HwMode<"+hvx-length128b", [UseHVX128B]>;
+def : HwModePredicateProlog<[{
+  const auto *HST = static_cast<const HexagonSubtarget *>(this);
+}]>;
+def Hvx64:  HwMode<[UseHVX64B]>;
+def Hvx128: HwMode<[UseHVX128B]>;
 
 //===----------------------------------------------------------------------===//
 // Classes used for relation maps.
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
index 75e87c95f2c4..f48695c6ebc0 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -38,11 +38,7 @@ class Enc_041d7b : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{24-23} = n1{3-2};
-  let Inst{13-13} = n1{1-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_046afa : OpcodeHexagon {
   bits <1> Mu2;
@@ -244,10 +240,7 @@ class Enc_14640c : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{24-22} = n1{3-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_14d27a : OpcodeHexagon {
   bits <5> II;
@@ -300,11 +293,7 @@ class Enc_178717 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <6> n1;
-  let Inst{28-28} = n1{5-5};
-  let Inst{25-23} = n1{4-2};
-  let Inst{13-13} = n1{1-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_179b35 : OpcodeHexagon {
   bits <5> Rs32;
@@ -384,9 +373,7 @@ class Enc_1de724 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <4> n1;
-  let Inst{28-28} = n1{3-3};
-  let Inst{24-22} = n1{2-0};
+  bits <0> n1;
 }
 class Enc_1ef990 : OpcodeHexagon {
   bits <2> Pv4;
@@ -772,10 +759,7 @@ class Enc_3694bd : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <5> n1;
-  let Inst{29-29} = n1{4-4};
-  let Inst{26-25} = n1{3-2};
-  let Inst{23-22} = n1{1-0};
+  bits <0> n1;
 }
 class Enc_372c9d : OpcodeHexagon {
   bits <2> Pv4;
@@ -820,10 +804,7 @@ class Enc_3a2484 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <4> n1;
-  let Inst{28-28} = n1{3-3};
-  let Inst{24-23} = n1{2-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_3a3d62 : OpcodeHexagon {
   bits <5> Rs32;
@@ -883,10 +864,7 @@ class Enc_3e3989 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <6> n1;
-  let Inst{28-28} = n1{5-5};
-  let Inst{25-22} = n1{4-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_3f97c8 : OpcodeHexagon {
   bits <6> Ii;
@@ -916,9 +894,7 @@ class Enc_405228 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <3> n1;
-  let Inst{28-28} = n1{2-2};
-  let Inst{24-23} = n1{1-0};
+  bits <0> n1;
 }
 class Enc_412ff0 : OpcodeHexagon {
   bits <5> Rss32;
@@ -1046,9 +1022,7 @@ class Enc_4aca3a : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <3> n1;
-  let Inst{29-29} = n1{2-2};
-  let Inst{26-25} = n1{1-0};
+  bits <0> n1;
 }
 class Enc_4b39e4 : OpcodeHexagon {
   bits <3> Ii;
@@ -1265,11 +1239,7 @@ class Enc_5a18b3 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <5> n1;
-  let Inst{29-29} = n1{4-4};
-  let Inst{26-25} = n1{3-2};
-  let Inst{22-22} = n1{1-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_5ab2be : OpcodeHexagon {
   bits <5> Rs32;
@@ -1445,11 +1415,7 @@ class Enc_6413b6 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <5> n1;
-  let Inst{29-29} = n1{4-4};
-  let Inst{26-25} = n1{3-2};
-  let Inst{23-23} = n1{1-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_645d54 : OpcodeHexagon {
   bits <2> Ii;
@@ -1490,9 +1456,7 @@ class Enc_668704 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{25-22} = n1{3-0};
+  bits <0> n1;
 }
 class Enc_66bce1 : OpcodeHexagon {
   bits <11> Ii;
@@ -1650,9 +1614,7 @@ class Enc_736575 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <4> n1;
-  let Inst{28-28} = n1{3-3};
-  let Inst{25-23} = n1{2-0};
+  bits <0> n1;
 }
 class Enc_74aef2 : OpcodeHexagon {
   bits <4> Ii;
@@ -1718,8 +1680,7 @@ class Enc_79b8c8 : OpcodeHexagon {
 class Enc_7a0ea6 : OpcodeHexagon {
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
-  bits <1> n1;
-  let Inst{9-9} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_7b523d : OpcodeHexagon {
   bits <5> Vu32;
@@ -1805,10 +1766,7 @@ class Enc_800e04 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <6> n1;
-  let Inst{28-28} = n1{5-5};
-  let Inst{25-22} = n1{4-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_80296d : OpcodeHexagon {
   bits <5> Rs32;
@@ -2067,10 +2025,7 @@ class Enc_8e583a : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{25-23} = n1{3-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_8f7633 : OpcodeHexagon {
   bits <5> Rs32;
@@ -2361,10 +2316,7 @@ class Enc_a42857 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{24-22} = n1{3-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_a4ef14 : OpcodeHexagon {
   bits <5> Rd32;
@@ -2413,11 +2365,7 @@ class Enc_a6853f : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <6> n1;
-  let Inst{29-29} = n1{5-5};
-  let Inst{26-25} = n1{4-3};
-  let Inst{23-22} = n1{2-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_a6ce9c : OpcodeHexagon {
   bits <6> Ii;
@@ -2593,10 +2541,7 @@ class Enc_b1e1fb : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <5> n1;
-  let Inst{28-28} = n1{4-4};
-  let Inst{25-23} = n1{3-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_b388cf : OpcodeHexagon {
   bits <5> Ii;
@@ -2661,10 +2606,7 @@ class Enc_b78edd : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <4> n1;
-  let Inst{28-28} = n1{3-3};
-  let Inst{24-23} = n1{2-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_b7fad3 : OpcodeHexagon {
   bits <2> Pv4;
@@ -2715,11 +2657,7 @@ class Enc_b909d2 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <7> n1;
-  let Inst{28-28} = n1{6-6};
-  let Inst{25-22} = n1{5-2};
-  let Inst{13-13} = n1{1-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_b91167 : OpcodeHexagon {
   bits <2> Ii;
@@ -3335,10 +3273,7 @@ class Enc_e90a15 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <4> n1;
-  let Inst{29-29} = n1{3-3};
-  let Inst{26-25} = n1{2-1};
-  let Inst{22-22} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_e957fb : OpcodeHexagon {
   bits <12> Ii;
@@ -3417,8 +3352,7 @@ class Enc_ee5ed0 : OpcodeHexagon {
   let Inst{7-4} = Rs16{3-0};
   bits <4> Rd16;
   let Inst{3-0} = Rd16{3-0};
-  bits <2> n1;
-  let Inst{9-8} = n1{1-0};
+  bits <0> n1;
 }
 class Enc_ef601b : OpcodeHexagon {
   bits <4> Ii;
@@ -3531,11 +3465,7 @@ class Enc_f6fe0b : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <4> Rs16;
   let Inst{19-16} = Rs16{3-0};
-  bits <6> n1;
-  let Inst{28-28} = n1{5-5};
-  let Inst{24-22} = n1{4-2};
-  let Inst{13-13} = n1{1-1};
-  let Inst{8-8} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_f7430e : OpcodeHexagon {
   bits <4> Ii;
@@ -3574,10 +3504,7 @@ class Enc_f7ea77 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <4> n1;
-  let Inst{29-29} = n1{3-3};
-  let Inst{26-25} = n1{2-1};
-  let Inst{13-13} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_f82302 : OpcodeHexagon {
   bits <11> Ii;
@@ -3585,10 +3512,7 @@ class Enc_f82302 : OpcodeHexagon {
   let Inst{7-1} = Ii{8-2};
   bits <3> Ns8;
   let Inst{18-16} = Ns8{2-0};
-  bits <4> n1;
-  let Inst{29-29} = n1{3-3};
-  let Inst{26-25} = n1{2-1};
-  let Inst{23-23} = n1{0-0};
+  bits <0> n1;
 }
 class Enc_f82eaf : OpcodeHexagon {
   bits <8> Ii;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 64bc5ca134c8..45d194e944fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -117,9 +117,10 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768;
 // Pin the vtable to this file.
 void HexagonInstrInfo::anchor() {}
 
-HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
-  : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
-    Subtarget(ST) {}
+HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST)
+    : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN,
+                          Hexagon::ADJCALLSTACKUP),
+      Subtarget(ST) {}
 
 namespace llvm {
 namespace HexagonFUnits {
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 086cb1fdd8ac..c17e5277ae2e 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -45,7 +45,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
   virtual void anchor();
 
 public:
-  explicit HexagonInstrInfo(HexagonSubtarget &ST);
+  explicit HexagonInstrInfo(const HexagonSubtarget &ST);
 
   /// TargetInstrInfo overrides.
 
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 72575f2560a3..1057b88530f4 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -104,9 +105,6 @@ static cl::opt<bool> HexagonVolatileMemcpy(
 static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
   cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
 
-static const char *HexagonVolatileMemcpyName
-  = "hexagon_memcpy_forward_vp4cp4n2";
-
 namespace {
 
 class HexagonLoopIdiomRecognize {
@@ -2246,6 +2244,11 @@ CleanupAndExit:
       Type *PtrTy = PointerType::get(Ctx, 0);
       Type *VoidTy = Type::getVoidTy(Ctx);
       Module *M = Func->getParent();
+
+      // FIXME: This should check if the call is supported
+      StringRef HexagonVolatileMemcpyName =
+          RTLIB::RuntimeLibcallsInfo::getLibcallImplName(
+              RTLIB::impl_hexagon_memcpy_forward_vp4cp4n2);
       FunctionCallee Fn = M->getOrInsertFunction(
           HexagonVolatileMemcpyName, VoidTy, PtrTy, PtrTy, Int32Ty);
 
diff --git a/llvm/lib/Target/Hexagon/HexagonOperands.td b/llvm/lib/Target/Hexagon/HexagonOperands.td
index 5134626c65c7..df5d32c13a73 100644
--- a/llvm/lib/Target/Hexagon/HexagonOperands.td
+++ b/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -27,9 +27,15 @@ def u9_0ImmPred  : PatLeaf<(i32 imm), [{
 def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; let RenderMethod = "addImmOperands"; }
 def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
 def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
-def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
+def n1Const : Operand<i32> {
+  let ParserMatchClass = n1ConstOperand;
+  let DecoderMethod = "n1ConstDecoder";
+}
 def sgp10ConstOperand : AsmOperandClass { let Name = "sgp10Const"; }
-def sgp10Const : Operand<i32> { let ParserMatchClass = sgp10ConstOperand; }
+def sgp10Const : Operand<i32> {
+  let ParserMatchClass = sgp10ConstOperand;
+  let DecoderMethod = "sgp10ConstDecoder";
+}
 
 def bblabel : Operand<i32>;
 def bbl     : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index c24700b89634..9cd0636306b1 100644
--- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -47,34 +47,100 @@ LLVMInitializeLanaiDisassembler() {
 LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
     : MCDisassembler(STI, Ctx) {}
 
-// Forward declare because the autogenerated code will reference this.
-// Definition is further down.
-static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
+// clang-format off
+static const unsigned GPRDecoderTable[] = {
+  Lanai::R0,  Lanai::R1,  Lanai::PC,  Lanai::R3,  Lanai::SP,  Lanai::FP,
+  Lanai::R6,  Lanai::R7,  Lanai::RV,  Lanai::R9,  Lanai::RR1, Lanai::RR2,
+  Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
+  Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
+  Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
+  Lanai::R30, Lanai::R31
+};
+// clang-format on
+
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                    uint64_t /*Address*/,
+                                    const MCDisassembler * /*Decoder*/) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
-                                        const MCDisassembler *Decoder);
+                                        const MCDisassembler *Decoder) {
+  // RI memory values encoded using 23 bits:
+  //   5 bit register, 16 bit constant
+  unsigned Register = (Insn >> 18) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
-                                        const MCDisassembler *Decoder);
+                                        const MCDisassembler *Decoder) {
+  // RR memory values encoded using 20 bits:
+  //   5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
+  unsigned Register = (Insn >> 15) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  Register = (Insn >> 10) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
                                     uint64_t Address,
-                                    const MCDisassembler *Decoder);
+                                    const MCDisassembler *Decoder) {
+  // SPLS memory values encoded using 17 bits:
+  //   5 bit register, 10 bit constant
+  unsigned Register = (Insn >> 12) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  unsigned Offset = (Insn & 0x3ff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
 
-static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
+  return MCDisassembler::Success;
+}
 
-static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
+static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const MCDisassembler *Decoder) {
+  return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
+                                           Width, /*InstSize=*/0);
+}
+
+static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
+                                 const MCDisassembler *Decoder) {
+  if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
+                                Decoder))
+    MI.addOperand(MCOperand::createImm(Insn));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
                                    uint64_t Address,
-                                   const MCDisassembler *Decoder);
+                                   const MCDisassembler *Decoder) {
+  unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  if (Val >= LPCC::UNKNOWN)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
 
 #include "LanaiGenDisassemblerTables.inc"
 
@@ -157,95 +223,3 @@ LanaiDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
 
   return MCDisassembler::Fail;
 }
-
-static const unsigned GPRDecoderTable[] = {
-    Lanai::R0,  Lanai::R1,  Lanai::PC,  Lanai::R3,  Lanai::SP,  Lanai::FP,
-    Lanai::R6,  Lanai::R7,  Lanai::RV,  Lanai::R9,  Lanai::RR1, Lanai::RR2,
-    Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
-    Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
-    Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
-    Lanai::R30, Lanai::R31};
-
-DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                    uint64_t /*Address*/,
-                                    const MCDisassembler * /*Decoder*/) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = GPRDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder) {
-  // RI memory values encoded using 23 bits:
-  //   5 bit register, 16 bit constant
-  unsigned Register = (Insn >> 18) & 0x1f;
-  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
-  unsigned Offset = (Insn & 0xffff);
-  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
-
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder) {
-  // RR memory values encoded using 20 bits:
-  //   5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
-  unsigned Register = (Insn >> 15) & 0x1f;
-  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
-  Register = (Insn >> 10) & 0x1f;
-  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
-
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder) {
-  // RI memory values encoded using 17 bits:
-  //   5 bit register, 10 bit constant
-  unsigned Register = (Insn >> 12) & 0x1f;
-  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
-  unsigned Offset = (Insn & 0x3ff);
-  Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
-
-  return MCDisassembler::Success;
-}
-
-static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
-                                     uint64_t Address, uint64_t Offset,
-                                     uint64_t Width, MCInst &MI,
-                                     const MCDisassembler *Decoder) {
-  return Decoder->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
-                                           Width, /*InstSize=*/0);
-}
-
-static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder) {
-  if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
-                                Decoder))
-    MI.addOperand(MCOperand::createImm(Insn));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder) {
-  unsigned Offset = (Insn & 0xffff);
-  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
-
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder) {
-  if (Val >= LPCC::UNKNOWN)
-    return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createImm(Val));
-  return MCDisassembler::Success;
-}
-\ No newline at end of file
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index 4ca97da16cde..02ed1001cd0d 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -13,6 +13,7 @@
 #include "LanaiInstrInfo.h"
 #include "LanaiAluCode.h"
 #include "LanaiCondCode.h"
+#include "LanaiSubtarget.h"
 #include "MCTargetDesc/LanaiBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -25,8 +26,8 @@ using namespace llvm;
 #define GET_INSTRINFO_CTOR_DTOR
 #include "LanaiGenInstrInfo.inc"
 
-LanaiInstrInfo::LanaiInstrInfo()
-    : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI)
+    : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
       RegisterInfo() {}
 
 void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index 07b1e87dc8b2..d98276243dc3 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -22,11 +22,13 @@
 
 namespace llvm {
 
+class LanaiSubtarget;
+
 class LanaiInstrInfo : public LanaiGenInstrInfo {
   const LanaiRegisterInfo RegisterInfo;
 
 public:
-  LanaiInstrInfo();
+  LanaiInstrInfo(const LanaiSubtarget &STI);
 
   // getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   // such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
index 1d968fa391c2..e0cd79ca22ff 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -212,7 +212,6 @@ def MemImmAsmOperand : AsmOperandClass {
   let ParserMethod  = "parseMemoryOperand";
 }
 def MEMi : Operand<i32> {
-  let MIOperandInfo = (ops i32lo21:$offset);
   let ParserMatchClass = MemImmAsmOperand;
   let PrintMethod   = "printMemImmOperand";
 }
@@ -402,7 +401,7 @@ def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm),
 def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm),
           (SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
 
-def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>;
+def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, (pred 0))>;
 
 let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0,
   isReMaterializable = 1 in
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index 24aa8553279f..f99e88373edf 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -40,5 +40,5 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
                                CodeModel::Model /*CodeModel*/,
                                CodeGenOptLevel /*OptLevel*/)
     : LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString),
-      FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
-      TLInfo(TM, *this) {}
+      InstrInfo(initializeSubtargetDependencies(Cpu, FeatureString)),
+      FrameLowering(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.h b/llvm/lib/Target/Lanai/LanaiSubtarget.h
index 0a229063ab7b..233c89e881d5 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.h
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.h
@@ -64,8 +64,8 @@ public:
   }
 
 private:
-  LanaiFrameLowering FrameLowering;
   LanaiInstrInfo InstrInfo;
+  LanaiFrameLowering FrameLowering;
   LanaiTargetLowering TLInfo;
   LanaiSelectionDAGInfo TSInfo;
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 39948b31fb9b..6497ff999f6f 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -39,7 +39,7 @@ def IsLA32
                          "LA32 Basic Integer and Privilege Instruction Set">;
 
 defvar LA32 = DefaultMode;
-def LA64 : HwMode<"+64bit", [IsLA64]>;
+def LA64 : HwMode<[IsLA64]>;
 
 // Single Precision floating point
 def FeatureBasicF
diff --git a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
index 9844163163a5..7dcf65ce2b82 100644
--- a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
+++ b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td
@@ -21,3 +21,7 @@ def CSR_ILP32D_LP64D
 
 // Needed for implementation of LoongArchRegisterInfo::getNoPreservedMask()
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_MostRegs : CalleeSavedRegs<(add CSR_ILP32S_LP64S,
+                                        (sequence "R%u", 4, 11),
+                                        (sequence "R%u", 16, 19))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 36c3011be2b9..c45975431d83 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -10,6 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+def NotBoolXor : PatFrags<(ops node:$val),
+                          [(xor node:$val, -1), (xor node:$val, 1)]>;
+
 //===----------------------------------------------------------------------===//
 // LoongArch specific DAG Nodes.
 //===----------------------------------------------------------------------===//
@@ -22,6 +25,9 @@ def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
 def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
 def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
 
+// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point
+// comparisons to prevent recursive lowering.
+def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>;
 def loongarch_movgr2fr_w_la64
     : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
 def loongarch_movfr2gr_s_la64
@@ -208,16 +214,18 @@ def : PatFPSetcc<SETUO,  FCMP_CUN_S,  FPR32>;
 def : PatFPSetcc<SETLT,  FCMP_CLT_S,  FPR32>;
 
 multiclass PatFPBrcond<CondCode cc, LAInst CmpInst, RegisterClass RegTy> {
-  def : Pat<(brcond (xor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), -1),
-                     bb:$imm21),
+  def : Pat<(loongarch_brcond (NotBoolXor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc))),
+                              bb:$imm21),
             (BCEQZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>;
-  def : Pat<(brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21),
+  def : Pat<(loongarch_brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21),
             (BCNEZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>;
 }
 
 defm : PatFPBrcond<SETOEQ, FCMP_CEQ_S, FPR32>;
+defm : PatFPBrcond<SETEQ , FCMP_CEQ_S, FPR32>;
 defm : PatFPBrcond<SETOLT, FCMP_CLT_S, FPR32>;
 defm : PatFPBrcond<SETOLE, FCMP_CLE_S, FPR32>;
+defm : PatFPBrcond<SETLE,  FCMP_CLE_S, FPR32>;
 defm : PatFPBrcond<SETONE, FCMP_CNE_S, FPR32>;
 defm : PatFPBrcond<SETO,   FCMP_COR_S, FPR32>;
 defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_S, FPR32>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 616640152c8d..965ad8a0a35c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -184,8 +184,10 @@ def : PatFPSetcc<SETUO,  FCMP_CUN_D,  FPR64>;
 def : PatFPSetcc<SETLT,  FCMP_CLT_D,  FPR64>;
 
 defm : PatFPBrcond<SETOEQ, FCMP_CEQ_D, FPR64>;
+defm : PatFPBrcond<SETEQ,  FCMP_CEQ_D, FPR64>;
 defm : PatFPBrcond<SETOLT, FCMP_CLT_D, FPR64>;
 defm : PatFPBrcond<SETOLE, FCMP_CLE_D, FPR64>;
+defm : PatFPBrcond<SETLE,  FCMP_CLE_D, FPR64>;
 defm : PatFPBrcond<SETONE, FCMP_CNE_D, FPR64>;
 defm : PatFPBrcond<SETO,   FCMP_COR_D, FPR64>;
 defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_D, FPR64>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
index 71d0263fe376..07e722b9a659 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -114,7 +114,7 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
     unsigned SplatBitSize;
     bool HasAnyUndefs;
     unsigned Op;
-    EVT ViaVecTy;
+    EVT ResTy = BVN->getValueType(0);
     bool Is128Vec = BVN->getValueType(0).is128BitVector();
     bool Is256Vec = BVN->getValueType(0).is256BitVector();
 
@@ -129,28 +129,25 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
       break;
     case 8:
       Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B;
-      ViaVecTy = Is256Vec ? MVT::v32i8 : MVT::v16i8;
       break;
     case 16:
       Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H;
-      ViaVecTy = Is256Vec ? MVT::v16i16 : MVT::v8i16;
       break;
     case 32:
       Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W;
-      ViaVecTy = Is256Vec ? MVT::v8i32 : MVT::v4i32;
       break;
     case 64:
       Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D;
-      ViaVecTy = Is256Vec ? MVT::v4i64 : MVT::v2i64;
       break;
     }
 
     SDNode *Res;
     // If we have a signed 10 bit integer, we can splat it directly.
     if (SplatValue.isSignedIntN(10)) {
-      SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
-                                              ViaVecTy.getVectorElementType());
-      Res = CurDAG->getMachineNode(Op, DL, ViaVecTy, Imm);
+      EVT EleType = ResTy.getVectorElementType();
+      APInt Val = SplatValue.sextOrTrunc(EleType.getSizeInBits());
+      SDValue Imm = CurDAG->getTargetConstant(Val, DL, EleType);
+      Res = CurDAG->getMachineNode(Op, DL, ResTy, Imm);
       ReplaceNode(Node, Res);
       return;
     }
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 5b2d185594f4..634914d3b3fd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -127,6 +127,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::BR_CC, GRLenVT, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::SELECT_CC, GRLenVT, Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
   setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand);
@@ -340,6 +341,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
          {MVT::v16i8, MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v8i16, MVT::v4i16,
           MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) {
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
     }
   }
 
@@ -377,6 +386,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
       setOperationAction(ISD::ABDS, VT, Legal);
       setOperationAction(ISD::ABDU, VT, Legal);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
     }
     for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
       setOperationAction(ISD::BITREVERSE, VT, Custom);
@@ -413,6 +423,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setTargetDAGCombine(ISD::BITCAST);
   }
 
+  // Set DAG combine for 'LASX' feature.
+
+  if (Subtarget.hasExtLASX())
+    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget.getRegisterInfo());
 
@@ -514,6 +529,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerPREFETCH(Op, DAG);
   case ISD::SELECT:
     return lowerSELECT(Op, DAG);
+  case ISD::BRCOND:
+    return lowerBRCOND(Op, DAG);
   case ISD::FP_TO_FP16:
     return lowerFP_TO_FP16(Op, DAG);
   case ISD::FP16_TO_FP:
@@ -522,10 +539,109 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerFP_TO_BF16(Op, DAG);
   case ISD::BF16_TO_FP:
     return lowerBF16_TO_FP(Op, DAG);
+  case ISD::VECREDUCE_ADD:
+    return lowerVECREDUCE_ADD(Op, DAG);
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_XOR:
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_UMIN:
+    return lowerVECREDUCE(Op, DAG);
   }
   return SDValue();
 }
 
+// Lower vecreduce_add using vhaddw instructions.
+// For Example:
+//  call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+// can be lowered to:
+//  VHADDW_D_W    vr0, vr0, vr0
+//  VHADDW_Q_D    vr0, vr0, vr0
+//  VPICKVE2GR_D  a0,  vr0, 0
+//  ADDI_W        a0,  a0,  0
+SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT OpVT = Op.getSimpleValueType();
+  SDValue Val = Op.getOperand(0);
+  // Immediates and indices use the native GPR type (GRLenVT), not i64.
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
+  unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+
+  unsigned LegalVecSize = 128;
+  bool isLASX256Vector =
+      Subtarget.hasExtLASX() && Val.getValueSizeInBits() == 256;
+
+  // Widen the operand until its type is legal.
+  while (!isTypeLegal(Val.getSimpleValueType())) {
+    Val = DAG.WidenVector(Val, DL);
+  }
+
+  // NumEles is designed for iterations count, v4i32 for LSX
+  // and v8i32 for LASX should have the same count.
+  if (isLASX256Vector) {
+    NumEles /= 2;
+    LegalVecSize = 256;
+  }
+  // Every step doubles the element width and halves the element count.
+  for (unsigned i = 1; i < NumEles; i *= 2, EleBits *= 2) {
+    MVT IntTy = MVT::getIntegerVT(EleBits);
+    MVT VecTy = MVT::getVectorVT(IntTy, LegalVecSize / EleBits);
+    Val = DAG.getNode(LoongArchISD::VHADDW, DL, VecTy, Val, Val);
+  }
+  // LASX only: presumably folds the high 128-bit lane's sum into element 0.
+  if (isLASX256Vector) {
+    SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val,
+                              DAG.getConstant(2, DL, GRLenVT));
+    Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
+                     DAG.getConstant(0, DL, GRLenVT));
+}
+
+// Lower vecreduce_and/or/xor/[s/u]max/[s/u]min.
+// For Example:
+//  call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
+// can be lowered to:
+//  VBSRL_V       vr1, vr0, 8
+//  VMAX_W        vr0, vr1, vr0
+//  VBSRL_V       vr1, vr0, 4
+//  VMAX_W        vr0, vr1, vr0
+//  VPICKVE2GR_W  a0,  vr0, 0
+// A 256-bit vector is illegal here; by default it is split into two
+// 128-bit vectors, which are then processed by this function.
+SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  MVT OpVT = Op.getSimpleValueType();
+  SDValue Val = Op.getOperand(0);
+
+  unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
+  unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+
+  // Widen the operand until its type is legal.
+  while (!isTypeLegal(Val.getSimpleValueType())) {
+    Val = DAG.WidenVector(Val, DL);
+  }
+
+  unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+  MVT VecTy = Val.getSimpleValueType();
+  // i elements span i * EleBits / 8 bytes; shift by half of that each step.
+  for (unsigned i = NumEles; i > 1; i /= 2) {
+    SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, GRLenVT);
+    SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt);
+    Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
+                     DAG.getConstant(0, DL, GRLenVT));
+}
+
 SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op,
                                                SelectionDAG &DAG) const {
   unsigned IsData = Op.getConstantOperandVal(4);
@@ -859,6 +975,35 @@ SDValue LoongArchTargetLowering::lowerSELECT(SDValue Op,
   return DAG.getNode(LoongArchISD::SELECT_CC, DL, VT, Ops);
 }
 
+SDValue LoongArchTargetLowering::lowerBRCOND(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // Operand 1 is the condition; operands 0 and 2 are forwarded unchanged.
+  SDValue CondV = Op.getOperand(1);
+  SDLoc DL(Op);
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  if (CondV.getOpcode() == ISD::SETCC) {
+    // setcc on GRLen-wide operands: fold the comparison into BR_CC.
+    if (CondV.getOperand(0).getValueType() == GRLenVT) {
+      SDValue LHS = CondV.getOperand(0);
+      SDValue RHS = CondV.getOperand(1);
+      ISD::CondCode CCVal = cast<CondCodeSDNode>(CondV.getOperand(2))->get();
+      translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+      SDValue TargetCC = DAG.getCondCode(CCVal);
+      return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(),
+                         Op.getOperand(0), LHS, RHS, TargetCC,
+                         Op.getOperand(2));
+    }
+    // setcc on floating-point operands: branch on the setcc via BRCOND.
+    if (CondV.getOperand(0).getValueType().isFloatingPoint())
+      return DAG.getNode(LoongArchISD::BRCOND, DL, Op.getValueType(),
+                         Op.getOperand(0), CondV, Op.getOperand(2));
+  }
+  // Otherwise branch when the condition value is not equal to zero.
+  return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(),
+                     Op.getOperand(0), CondV, DAG.getConstant(0, DL, GRLenVT),
+                     DAG.getCondCode(ISD::SETNE), Op.getOperand(2));
+}
+
 SDValue
 LoongArchTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
@@ -1031,6 +1176,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
 static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
                                           MVT VT, SDValue V1, SDValue V2,
                                           SelectionDAG &DAG,
+                                          const LoongArchSubtarget &Subtarget,
                                           const APInt &Zeroable) {
   int Size = Mask.size();
   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -1057,7 +1203,7 @@ static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask,
          "Illegal integer vector type");
   V = DAG.getBitcast(ShiftVT, V);
   V = DAG.getNode(Opcode, DL, ShiftVT, V,
-                  DAG.getConstant(ShiftAmt, DL, MVT::i64));
+                  DAG.getConstant(ShiftAmt, DL, Subtarget.getGRLenVT()));
   return DAG.getBitcast(VT, V);
 }
 
@@ -1226,10 +1372,10 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
 ///      (VBSRL_V $v1, $v1, 8)
 ///      (VBSLL_V $v0, $v0, 8)
 ///      (VOR_V $v0, $V0, $v1)
-static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL,
-                                               ArrayRef<int> Mask, MVT VT,
-                                               SDValue V1, SDValue V2,
-                                               SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                SDValue V1, SDValue V2, SelectionDAG &DAG,
+                                const LoongArchSubtarget &Subtarget) {
 
   SDValue Lo = V1, Hi = V2;
   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
@@ -1242,11 +1388,12 @@ static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL,
 
   int LoByteShift = 16 - ByteRotation;
   int HiByteShift = ByteRotation;
+  MVT GRLenVT = Subtarget.getGRLenVT();
 
   SDValue LoShift = DAG.getNode(LoongArchISD::VBSLL, DL, ByteVT, Lo,
-                                DAG.getConstant(LoByteShift, DL, MVT::i64));
+                                DAG.getConstant(LoByteShift, DL, GRLenVT));
   SDValue HiShift = DAG.getNode(LoongArchISD::VBSRL, DL, ByteVT, Hi,
-                                DAG.getConstant(HiByteShift, DL, MVT::i64));
+                                DAG.getConstant(HiByteShift, DL, GRLenVT));
   return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LoShift, HiShift));
 }
 
@@ -1351,9 +1498,10 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
 ///
 /// When undef's appear in the mask they are treated as if they were whatever
 /// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                             SDValue V1, SDValue V2, SelectionDAG &DAG,
+                             const LoongArchSubtarget &Subtarget) {
   int SplatIndex = -1;
   for (const auto &M : Mask) {
     if (M != -1) {
@@ -1369,7 +1517,7 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
   if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
     APInt Imm(64, SplatIndex);
     return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, MVT::i64));
+                       DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
   }
 
   return SDValue();
@@ -1393,9 +1541,10 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
 ///   (VSHUF4I_H $v0, $v1, 27)
 /// where the 27 comes from:
 ///   3 + (2 << 2) + (1 << 4) + (0 << 6)
-static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                            SDValue V1, SDValue V2, SelectionDAG &DAG,
+                            const LoongArchSubtarget &Subtarget) {
 
   unsigned SubVecSize = 4;
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -1437,13 +1586,15 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
     Imm |= M & 0x3;
   }
 
+  MVT GRLenVT = Subtarget.getGRLenVT();
+
   // Return vshuf4i.d
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
     return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, V2,
-                       DAG.getConstant(Imm, DL, MVT::i64));
+                       DAG.getConstant(Imm, DL, GRLenVT));
 
   return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
-                     DAG.getConstant(Imm, DL, MVT::i64));
+                     DAG.getConstant(Imm, DL, GRLenVT));
 }
 
 /// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
@@ -1723,7 +1874,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine breaks down the specific type of 128-bit shuffle and
 /// dispatches to the lowering routines accordingly.
 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG,
+                                  const LoongArchSubtarget &Subtarget) {
   assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
           VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
           VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
@@ -1741,9 +1893,11 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
+    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG,
+                                               Subtarget)))
       return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+    if ((Result =
+             lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
       return Result;
 
     // TODO: This comment may be enabled in the future to better match the
@@ -1766,15 +1920,17 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
   if ((VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v2f64) &&
-      (Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+      (Result =
+           lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
     return Result;
   if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG,
                                                      Zeroable)))
     return Result;
-  if ((Result =
-           lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
+  if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget,
+                                           Zeroable)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG,
+                                                Subtarget)))
     return Result;
   if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG))
     return NewShuffle;
@@ -1791,10 +1947,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 ///
 /// When undef's appear in the mask they are treated as if they were whatever
 /// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
-                                             ArrayRef<int> Mask, MVT VT,
-                                             SDValue V1, SDValue V2,
-                                             SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                              SDValue V1, SDValue V2, SelectionDAG &DAG,
+                              const LoongArchSubtarget &Subtarget) {
   int SplatIndex = -1;
   for (const auto &M : Mask) {
     if (M != -1) {
@@ -1816,21 +1972,64 @@ static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
                               0)) {
     APInt Imm(64, SplatIndex);
     return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, MVT::i64));
+                       DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
   }
 
   return SDValue();
 }
 
 /// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
+static SDValue
+lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                             SDValue V1, SDValue V2, SelectionDAG &DAG,
+                             const LoongArchSubtarget &Subtarget) {
   // When the size is less than or equal to 4, lower cost instructions may be
   // used.
   if (Mask.size() <= 4)
     return SDValue();
-  return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
+  return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+  // LASX only provides XVPERM_W, so only 8 x 32-bit shuffles are handled.
+  if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts / 2;
+  bool FrontLo = true, FrontHi = true;
+  bool BackLo = true, BackHi = true;
+  // A mask value of -1 (undef) is accepted by every range.
+  auto inRange = [](int val, int low, int high) {
+    return (val == -1) || (val >= low && val < high);
+  };
+
+  for (unsigned i = 0; i < HalfSize; ++i) {
+    int Fronti = Mask[i];
+    int Backi = Mask[i + HalfSize];
+
+    FrontLo &= inRange(Fronti, 0, HalfSize);
+    FrontHi &= inRange(Fronti, HalfSize, NumElts);
+    BackLo &= inRange(Backi, 0, HalfSize);
+    BackHi &= inRange(Backi, HalfSize, NumElts);
+  }
+
+  // If both the lower and upper 128-bit parts access only one half of the
+  // vector (either lower or upper), avoid using xvperm.w. The latency of
+  // xvperm.w(3) is higher than using xvshuf(1) and xvori(1).
+  if ((FrontLo || FrontHi) && (BackLo || BackHi))
+    return SDValue();
+  // Build the mask from native GPR-sized constants (legal on LA32 too).
+  MVT GRLenVT = DAG.getSubtarget<LoongArchSubtarget>().getGRLenVT();
+  SmallVector<SDValue, 8> Masks;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(GRLenVT)
+                                  : DAG.getConstant(Mask[i], DL, GRLenVT));
+  SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks);
+  return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec);
 }
 
 /// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
@@ -2060,15 +2259,15 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
 /// cases need to be converted to it for processing.
 ///
 /// This function may modify V1, V2 and Mask
-static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
-                                            MutableArrayRef<int> Mask, MVT VT,
-                                            SDValue &V1, SDValue &V2,
-                                            SelectionDAG &DAG) {
+static void canonicalizeShuffleVectorByLane(
+    const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1,
+    SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
 
   enum HalfMaskType { HighLaneTy, LowLaneTy, None };
 
   int MaskSize = Mask.size();
   int HalfSize = Mask.size() / 2;
+  MVT GRLenVT = Subtarget.getGRLenVT();
 
   HalfMaskType preMask = None, postMask = None;
 
@@ -2106,13 +2305,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
   if (preMask == LowLaneTy && postMask == HighLaneTy) {
     V1 = DAG.getBitcast(MVT::v4i64, V1);
     V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b01001110, DL, MVT::i64));
+                     DAG.getConstant(0b01001110, DL, GRLenVT));
     V1 = DAG.getBitcast(VT, V1);
 
     if (!V2.isUndef()) {
       V2 = DAG.getBitcast(MVT::v4i64, V2);
       V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b01001110, DL, MVT::i64));
+                       DAG.getConstant(0b01001110, DL, GRLenVT));
       V2 = DAG.getBitcast(VT, V2);
     }
 
@@ -2125,13 +2324,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
   } else if (preMask == LowLaneTy && postMask == LowLaneTy) {
     V1 = DAG.getBitcast(MVT::v4i64, V1);
     V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b11101110, DL, MVT::i64));
+                     DAG.getConstant(0b11101110, DL, GRLenVT));
     V1 = DAG.getBitcast(VT, V1);
 
     if (!V2.isUndef()) {
       V2 = DAG.getBitcast(MVT::v4i64, V2);
       V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b11101110, DL, MVT::i64));
+                       DAG.getConstant(0b11101110, DL, GRLenVT));
       V2 = DAG.getBitcast(VT, V2);
     }
 
@@ -2141,13 +2340,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
   } else if (preMask == HighLaneTy && postMask == HighLaneTy) {
     V1 = DAG.getBitcast(MVT::v4i64, V1);
     V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b01000100, DL, MVT::i64));
+                     DAG.getConstant(0b01000100, DL, GRLenVT));
     V1 = DAG.getBitcast(VT, V1);
 
     if (!V2.isUndef()) {
       V2 = DAG.getBitcast(MVT::v4i64, V2);
       V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b01000100, DL, MVT::i64));
+                       DAG.getConstant(0b01000100, DL, GRLenVT));
       V2 = DAG.getBitcast(VT, V2);
     }
 
@@ -2209,7 +2408,8 @@ static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL,
 /// This routine breaks down the specific type of 256-bit shuffle and
 /// dispatches to the lowering routines accordingly.
 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG,
+                                  const LoongArchSubtarget &Subtarget) {
   assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 ||
           VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 ||
           VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) &&
@@ -2223,7 +2423,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 
   // canonicalize non cross-lane shuffle vector
   SmallVector<int> NewMask(Mask);
-  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG);
+  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget);
 
   APInt KnownUndef, KnownZero;
   computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero);
@@ -2232,9 +2432,13 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG)))
+    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG,
+                                                Subtarget)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
+                                               Subtarget)))
       return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
+    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
       return Result;
     if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
                                                              V1, V2, DAG)))
@@ -2259,10 +2463,11 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result =
-           lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable)))
+  if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG,
+                                           Subtarget, Zeroable)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG,
+                                                Subtarget)))
     return Result;
   if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG))
     return NewShuffle;
@@ -2314,10 +2519,10 @@ SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
 
   // For each vector width, delegate to a specialized lowering routine.
   if (VT.is128BitVector())
-    return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+    return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget);
 
   if (VT.is256BitVector())
-    return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+    return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget);
 
   return SDValue();
 }
@@ -2414,11 +2619,14 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
   }
 
   // make sure that this load is valid and only has one user.
-  if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
+  if (!IsIdeneity || !IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
     return SDValue();
 
-  if (IsIdeneity) {
-    auto *LN = cast<LoadSDNode>(IdentitySrc);
+  auto *LN = cast<LoadSDNode>(IdentitySrc);
+  auto ExtType = LN->getExtensionType();
+
+  if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) &&
+      VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) {
     SDVTList Tys =
         LN->isIndexed()
             ? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other)
@@ -2461,6 +2669,16 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
         SplatBitSize != 64)
       return SDValue();
 
+    if (SplatBitSize == 64 && !Subtarget.is64Bit()) {
+      // We can only handle 64-bit elements that are within
+      // the signed 32-bit range on 32-bit targets.
+      if (!SplatValue.isSignedIntN(32))
+        return SDValue();
+      if ((Is128Vec && ResTy == MVT::v4i32) ||
+          (Is256Vec && ResTy == MVT::v8i32))
+        return Op;
+    }
+
     EVT ViaVecTy;
 
     switch (SplatBitSize) {
@@ -2609,14 +2827,58 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op,
 SDValue
 LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
-  EVT VecTy = Op->getOperand(0)->getValueType(0);
+  MVT EltVT = Op.getSimpleValueType();
+  SDValue Vec = Op->getOperand(0);
+  EVT VecTy = Vec->getValueType(0);
   SDValue Idx = Op->getOperand(1);
-  unsigned NumElts = VecTy.getVectorNumElements();
+  SDLoc DL(Op);
+  MVT GRLenVT = Subtarget.getGRLenVT();
+  // Only 256-bit vector operands are expected here.
+  assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type");
 
-  if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
+  if (isa<ConstantSDNode>(Idx)) // Constant index: keep the node as is.
     return Op;
 
-  return SDValue();
+  switch (VecTy.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Unexpected type");
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v4i64:
+  case MVT::v4f64: {
+    // Extract the high half subvector and place it to the low half of a new
+    // vector. It doesn't matter what the high half of the new vector is.
+    EVT HalfTy = VecTy.getHalfNumVectorElementsVT(*DAG.getContext());
+    SDValue VecHi =
+        DAG.getExtractSubvector(DL, HalfTy, Vec, HalfTy.getVectorNumElements());
+    SDValue TmpVec =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecTy, DAG.getUNDEF(VecTy),
+                    VecHi, DAG.getConstant(0, DL, GRLenVT));
+
+    // Shuffle the origin Vec and the TmpVec using MaskVec, the lowest element
+    // of MaskVec is Idx, the rest do not matter. ResVec[0] will hold the
+    // desired element.
+    SDValue IdxCp =
+        DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, Idx);
+    SDValue IdxVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f32, IdxCp);
+    SDValue MaskVec =
+        DAG.getBitcast((VecTy == MVT::v4f64) ? MVT::v4i64 : VecTy, IdxVec);
+    SDValue ResVec =
+        DAG.getNode(LoongArchISD::VSHUF, DL, VecTy, MaskVec, TmpVec, Vec);
+
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ResVec,
+                       DAG.getConstant(0, DL, GRLenVT));
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: { // Permute with a splat of Idx, then read element 0.
+    SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx);
+    SDValue SplatValue =
+        DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx);
+
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue,
+                       DAG.getConstant(0, DL, GRLenVT));
+  }
+  }
 }
 
 SDValue
@@ -4740,13 +5002,29 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
     UseLASX = true;
     break;
   };
-  if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
-    return SDValue();
   Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
                       : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
-  Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
 
-  SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
+  SDValue V;
+  if (!Subtarget.has32S() || !Subtarget.hasExtLASX()) {
+    if (Src.getSimpleValueType() == MVT::v32i8) {
+      SDValue Lo, Hi;
+      std::tie(Lo, Hi) = DAG.SplitVector(Src, DL);
+      Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo);
+      Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi);
+      Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+                       DAG.getConstant(16, DL, MVT::i8));
+      V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+    } else if (UseLASX) {
+      return SDValue();
+    }
+  }
+
+  if (!V) {
+    Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+    V = DAG.getNode(Opc, DL, MVT::i64, Src);
+  }
+
   EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
   V = DAG.getZExtOrTrunc(V, DL, T);
   return DAG.getBitcast(VT, V);
@@ -5154,6 +5432,145 @@ static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG,
                      Src.getOperand(0));
 }
 
+// Common BR_CC/SELECT_CC condition folds; true if LHS/RHS/CC were updated.
+static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL,
+                       SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
+  ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+
+  // An arithmetic right shift always preserves the sign, so the shift can
+  // be omitted when only the sign is tested.
+  // Fold setlt (sra X, N), 0 -> setlt X, 0 and
+  // setge (sra X, N), 0 -> setge X, 0
+  if (isNullConstant(RHS) && (CCVal == ISD::SETGE || CCVal == ISD::SETLT) &&
+      LHS.getOpcode() == ISD::SRA) {
+    LHS = LHS.getOperand(0);
+    return true;
+  }
+  // The remaining folds require an integer equality condition code.
+  if (!ISD::isIntEqualitySetCC(CCVal))
+    return false;
+
+  // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt)
+  // Sometimes the setcc is introduced after br_cc/select_cc has been formed.
+  if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+      LHS.getOperand(0).getValueType() == Subtarget.getGRLenVT()) {
+    // If we're looking for eq 0 instead of ne 0, we need to invert the
+    // condition.
+    bool Invert = CCVal == ISD::SETEQ;
+    CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+    if (Invert)
+      CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+    RHS = LHS.getOperand(1);
+    LHS = LHS.getOperand(0);
+    translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+    CC = DAG.getCondCode(CCVal);
+    return true;
+  }
+
+  // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, GRLen-1-C), 0, ge/lt)
+  if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() &&
+      LHS.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue LHS0 = LHS.getOperand(0);
+    if (LHS0.getOpcode() == ISD::AND &&
+        LHS0.getOperand(1).getOpcode() == ISD::Constant) {
+      uint64_t Mask = LHS0.getConstantOperandVal(1);
+      uint64_t ShAmt = LHS.getConstantOperandVal(1);
+      if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) {
+        CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+        CC = DAG.getCondCode(CCVal);
+        // Move the tested bit into the sign position.
+        ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt;
+        LHS = LHS0.getOperand(0);
+        if (ShAmt != 0)
+          LHS =
+              DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0),
+                          DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+        return true;
+      }
+    }
+  }
+
+  // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1.
+  // This can occur when legalizing some floating point comparisons.
+  APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+  if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
+    CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+    CC = DAG.getCondCode(CCVal);
+    RHS = DAG.getConstant(0, DL, LHS.getValueType());
+    return true;
+  }
+
+  return false;
+}
+
+static SDValue performBR_CCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const LoongArchSubtarget &Subtarget) {
+  // Operands 1-3 are the compare (LHS, RHS, CC); 0 and 4 pass through.
+  SDLoc DL(N);
+  SDValue CmpLHS = N->getOperand(1);
+  SDValue CmpRHS = N->getOperand(2);
+  SDValue CondCode = N->getOperand(3);
+  // Nothing to rebuild unless combine_CC simplified the condition.
+  if (!combine_CC(CmpLHS, CmpRHS, CondCode, DL, DAG, Subtarget))
+    return SDValue();
+  return DAG.getNode(LoongArchISD::BR_CC, DL, N->getValueType(0),
+                     N->getOperand(0), CmpLHS, CmpRHS, CondCode, N->getOperand(4));
+}
+
+static SDValue performSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const LoongArchSubtarget &Subtarget) {
+  // Operands: compare LHS, RHS, CC, then the true and false values.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue CC = N->getOperand(2);
+  ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+  SDValue TrueV = N->getOperand(3);
+  SDValue FalseV = N->getOperand(4);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // If the True and False values are the same, we don't need a select_cc.
+  if (TrueV == FalseV)
+    return TrueV;
+
+  // (select (x < 0), y, z)  -> ((x >> (GRLEN - 1)) & (y - z)) + z
+  // (select (x >= 0), y, z) -> ((x >> (GRLEN - 1)) & (z - y)) + y
+  if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) &&
+      isNullConstant(RHS) &&
+      (CCVal == ISD::CondCode::SETLT || CCVal == ISD::CondCode::SETGE)) {
+    if (CCVal == ISD::CondCode::SETGE)
+      std::swap(TrueV, FalseV);
+
+    int64_t TrueSImm = cast<ConstantSDNode>(TrueV)->getSExtValue();
+    int64_t FalseSImm = cast<ConstantSDNode>(FalseV)->getSExtValue();
+    // Only handle simm12 values; anything outside that range can be
+    // considered as living in a register.
+    if (isInt<12>(TrueSImm) && isInt<12>(FalseSImm) &&
+        isInt<12>(TrueSImm - FalseSImm)) {
+      SDValue SRA =
+          DAG.getNode(ISD::SRA, DL, VT, LHS,
+                      DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT));
+      SDValue AND =
+          DAG.getNode(ISD::AND, DL, VT, SRA,
+                      DAG.getSignedConstant(TrueSImm - FalseSImm, DL, VT));
+      return DAG.getNode(ISD::ADD, DL, VT, AND, FalseV);
+    }
+    // Fold not applied: undo the swap so the operands are unchanged.
+    if (CCVal == ISD::CondCode::SETGE)
+      std::swap(TrueV, FalseV);
+  }
+
+  if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
+    return DAG.getNode(LoongArchISD::SELECT_CC, DL, N->getValueType(0),
+                       {LHS, RHS, CC, TrueV, FalseV});
+
+  return SDValue();
+}
+
 template <unsigned N>
 static SDValue legalizeIntrinsicImmArg(SDNode *Node, unsigned ImmOp,
                                        SelectionDAG &DAG,
@@ -5828,6 +6245,42 @@ performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Pre-legalization combine for EXTRACT_VECTOR_ELT on 256-bit vectors with a
+// non-constant index: looks through an extend-of-truncate on the index.
+static SDValue
+performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const LoongArchSubtarget &Subtarget) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+  if (!Vec.getValueType().is256BitVector() || isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  // Combine:
+  //   t2 = truncate t1
+  //   t3 = {zero/sign/any}_extend t2
+  //   t4 = extract_vector_elt t0, t3
+  // to:
+  //   t4 = extract_vector_elt t0, t1
+  unsigned IdxOp = Idx.getOpcode();
+  if (IdxOp != ISD::ZERO_EXTEND && IdxOp != ISD::SIGN_EXTEND &&
+      IdxOp != ISD::ANY_EXTEND)
+    return SDValue();
+
+  SDValue IdxOrig = Idx.getOperand(0);
+  if (IdxOrig.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // This runs before type legalization, so the result type is not guaranteed
+  // to be a simple MVT; keep it as an EVT rather than getSimpleValueType(0),
+  // which asserts on extended types.
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0),
+                     Vec, IdxOrig.getOperand(0));
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5846,6 +6299,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performBITCASTCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::BITREV_W:
     return performBITREV_WCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::BR_CC:
+    return performBR_CCCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::SELECT_CC:
+    return performSELECT_CCCombine(N, DAG, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return performINTRINSIC_WO_CHAINCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::MOVGR2FR_W_LA64:
@@ -5857,6 +6314,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::SPLIT_PAIR_F64:
     return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -6575,6 +7034,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(TAIL_MEDIUM)
     NODE_NAME_CASE(TAIL_LARGE)
     NODE_NAME_CASE(SELECT_CC)
+    NODE_NAME_CASE(BR_CC)
+    NODE_NAME_CASE(BRCOND)
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
@@ -6637,6 +7098,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VREPLVEI)
     NODE_NAME_CASE(VREPLGR2VR)
     NODE_NAME_CASE(XVPERMI)
+    NODE_NAME_CASE(XVPERM)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
@@ -6659,6 +7121,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(XVMSKGEZ)
     NODE_NAME_CASE(XVMSKEQZ)
     NODE_NAME_CASE(XVMSKNEZ)
+    NODE_NAME_CASE(VHADDW)
   }
 #undef NODE_NAME_CASE
   return nullptr;
@@ -7132,6 +7595,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     llvm_unreachable("Unsupported calling convention");
   case CallingConv::C:
   case CallingConv::Fast:
+  case CallingConv::PreserveMost:
     break;
   case CallingConv::GHC:
     if (!MF.getSubtarget().hasFeature(LoongArch::FeatureBasicF) ||
@@ -7893,7 +8357,7 @@ LoongArchTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
     if (Size < 32 && (AI->getOperation() == AtomicRMWInst::And ||
                       AI->getOperation() == AtomicRMWInst::Or ||
                       AI->getOperation() == AtomicRMWInst::Xor))
-      return AtomicExpansionKind::Expand;
+      return AtomicExpansionKind::CustomExpand;
     if (AI->getOperation() == AtomicRMWInst::Nand || Size < 32)
       return AtomicExpansionKind::CmpXChg;
   }
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f79ba7450cc3..9d14934a9d36 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -37,6 +37,10 @@ enum NodeType : unsigned {
   // Select
   SELECT_CC,
 
+  // Branch
+  BR_CC,
+  BRCOND,
+
   // 32-bit shifts, directly matching the semantics of the named LoongArch
   // instructions.
   SLL_W,
@@ -141,6 +145,7 @@ enum NodeType : unsigned {
   VREPLVEI,
   VREPLGR2VR,
   XVPERMI,
+  XVPERM,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
@@ -177,6 +182,9 @@ enum NodeType : unsigned {
   XVMSKEQZ,
   XVMSKNEZ,
 
+  // Vector Horizontal Addition with Widening
+  VHADDW
+
   // Intrinsic operations end =============================================
 };
 } // end namespace LoongArchISD
@@ -382,10 +390,13 @@ private:
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 26d36f1c5058..c89212dae72d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -25,8 +25,8 @@ using namespace llvm;
 #define GET_INSTRINFO_CTOR_DTOR
 #include "LoongArchGenInstrInfo.inc"
 
-LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI)
-    : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN,
+LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI)
+    : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN,
                             LoongArch::ADJCALLSTACKUP),
       STI(STI) {}
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index 63b7112b8b40..f25958a32bec 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -25,7 +25,7 @@ class LoongArchSubtarget;
 
 class LoongArchInstrInfo : public LoongArchGenInstrInfo {
 public:
-  explicit LoongArchInstrInfo(LoongArchSubtarget &STI);
+  explicit LoongArchInstrInfo(const LoongArchSubtarget &STI);
 
   MCInst getNop() const override;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 2b94e65cac0e..20ccc622f58d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -31,6 +31,10 @@ def SDT_LoongArchSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
                                                  SDTCisSameAs<0, 4>,
                                                  SDTCisSameAs<4, 5>]>;
 
+// BR_CC type profile: no results; operands 0/1 share a type, 2/3 are OtherVT.
+def SDT_LoongArchBrCC : SDTypeProfile<0, 4, [
+  SDTCisSameAs<0, 1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>]>;
+
 def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [
   SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>,
   SDTCisSameAs<3, 4>
@@ -94,6 +98,8 @@ def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                                    SDNPVariadic]>;
 def loongarch_selectcc : SDNode<"LoongArchISD::SELECT_CC", SDT_LoongArchSelectCC>;
+def loongarch_brcc
+    : SDNode<"LoongArchISD::BR_CC", SDT_LoongArchBrCC, [SDNPHasChain]>;
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -1537,47 +1543,29 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f),
 
 /// Branches and jumps
 
-class BccPat<PatFrag CondOp, LAInst Inst>
-    : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16),
-          (Inst GPR:$rj, GPR:$rd, bb:$imm16)>;
-
-def : BccPat<seteq, BEQ>;
-def : BccPat<setne, BNE>;
-def : BccPat<setlt, BLT>;
-def : BccPat<setge, BGE>;
-def : BccPat<setult, BLTU>;
-def : BccPat<setuge, BGEU>;
-
-class BccSwapPat<PatFrag CondOp, LAInst InstBcc>
-    : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16),
-          (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>;
-
-// Condition codes that don't have matching LoongArch branch instructions, but
-// are trivially supported by swapping the two input operands.
-def : BccSwapPat<setgt, BLT>;
-def : BccSwapPat<setle, BGE>;
-def : BccSwapPat<setugt, BLTU>;
-def : BccSwapPat<setule, BGEU>;
-
 let Predicates = [Has32S] in {
-// An extra pattern is needed for a brcond without a setcc (i.e. where the
-// condition was calculated elsewhere).
-def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>;
-
-def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm21),
-          (BEQZ GPR:$rj, bb:$imm21)>;
-def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm21),
-          (BNEZ GPR:$rj, bb:$imm21)>;
+class BccZeroPat<CondCode CondVal, LAInst BrInst>
+    : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, CondVal, bb:$imm21),
+          (BrInst GPR:$rj, bb:$imm21)>;
+
+def : BccZeroPat<SETEQ, BEQZ>;
+def : BccZeroPat<SETNE, BNEZ>;
 } // Predicates = [Has32S]
 
-// An extra pattern is needed for a brcond without a setcc (i.e. where the
-// condition was calculated elsewhere).
-def : Pat<(brcond GPR:$rj, bb:$imm16), (BNE GPR:$rj, R0, bb:$imm16)>;
+multiclass BccPat<CondCode CondVal, LAInst BrInst> {
+  def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), GPR:$rd, CondVal, bb:$imm16),
+            (BrInst GPR:$rj, GPR:$rd, bb:$imm16)>;
+  // Map a literal 0 straight to R0; the register coalescer may not fold it.
+  def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, CondVal, bb:$imm16),
+            (BrInst GPR:$rj, (GRLenVT R0), bb:$imm16)>;
+}
 
-def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm16),
-          (BEQ GPR:$rj, R0, bb:$imm16)>;
-def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm16),
-          (BNE GPR:$rj, R0, bb:$imm16)>;
+defm : BccPat<SETEQ, BEQ>;
+defm : BccPat<SETNE, BNE>;
+defm : BccPat<SETLT, BLT>;
+defm : BccPat<SETGE, BGE>;
+defm : BccPat<SETULT, BLTU>;
+defm : BccPat<SETUGE, BGEU>;
 
 let isBarrier = 1, isBranch = 1, isTerminator = 1 in
 def PseudoBR : Pseudo<(outs), (ins simm26_b:$imm26), [(br bb:$imm26)]>,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 0696b11d62ac..a79c01cbe577 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,8 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>, SDTCisInt<2>]>;
+
 // Target nodes.
 def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
+def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>;
 def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>;
 def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>;
@@ -1186,6 +1190,17 @@ multiclass PatXrXrXr<SDPatternOperator OpNode, string Inst> {
             (!cast<LAInst>(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>;
 }
 
+multiclass PatXrXrW<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_H_B") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_W_H") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_D_W") LASX256:$xj, LASX256:$xk)>;
+  def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)),
+            (!cast<LAInst>(Inst#"_Q_D") LASX256:$xj, LASX256:$xk)>;
+}
+
 multiclass PatShiftXrXr<SDPatternOperator OpNode, string Inst> {
   def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7,
                                               (v32i8 LASX256:$xk))),
@@ -1513,6 +1528,9 @@ def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>;
 def : Pat<(bswap (v4i64 LASX256:$xj)),
           (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>;
 
+// XVHADDW_{H_B/W_H/D_W/Q_D}
+defm : PatXrXrW<loongarch_vhaddw, "XVHADDW">;
+
 // XVFADD_{S/D}
 defm : PatXrXrF<fadd, "XVFADD">;
 
@@ -1852,6 +1870,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
 def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
           (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
 
+// XVPERM_W: first operand is v8i32 or v8f32; second operand is always v8i32.
+foreach vt = [v8i32, v8f32] in {
+  def : Pat<(loongarch_xvperm vt:$xj, v8i32:$xk),
+            (XVPERM_W vt:$xj, v8i32:$xk)>;
+}
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 3c9defb0366f..eb7120ffb41a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -22,7 +22,7 @@ def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
 def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                      SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
 def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
-                                        SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
+                                        SDTCisSameAs<0,1>, SDTCisVT<2, GRLenVT>]>;
 def SDT_LoongArchV2RUimm
     : SDTypeProfile<1, 3,
                     [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
@@ -71,6 +71,8 @@ def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
 def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
 def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>;
 
+def loongarch_vhaddw : SDNode<"LoongArchISD::VHADDW", SDT_LoongArchV2R>;
+
 def loongarch_vldrepl
     : SDNode<"LoongArchISD::VLDREPL",
              SDT_LoongArchVLDREPL, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -1364,6 +1366,17 @@ multiclass PatVrVrVr<SDPatternOperator OpNode, string Inst> {
             (!cast<LAInst>(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>;
 }
 
+multiclass PatVrVrW<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>;
+  def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)),
+            (!cast<LAInst>(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>;
+}
+
 multiclass PatShiftVrVr<SDPatternOperator OpNode, string Inst> {
   def : Pat<(OpNode (v16i8 LSX128:$vj), (and vsplati8_imm_eq_7,
                                              (v16i8 LSX128:$vk))),
@@ -1709,6 +1722,9 @@ def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>;
 def : Pat<(bswap (v2i64 LSX128:$vj)),
           (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>;
 
+// VHADDW_{H_B/W_H/D_W/Q_D}
+defm : PatVrVrW<loongarch_vhaddw, "VHADDW">;
+
 // VFADD_{S/D}
 defm : PatVrVrF<fadd, "VFADD">;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
index 47fce37ce59f..9c5f8edfaf66 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
@@ -41,6 +41,8 @@ LoongArchRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
   if (MF->getFunction().getCallingConv() == CallingConv::GHC)
     return CSR_NoRegs_SaveList;
+  if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+    return CSR_MostRegs_SaveList;
   switch (Subtarget.getTargetABI()) {
   default:
     llvm_unreachable("Unrecognized ABI");
@@ -63,6 +65,8 @@ LoongArchRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
 
   if (CC == CallingConv::GHC)
     return CSR_NoRegs_RegMask;
+  if (CC == CallingConv::PreserveMost)
+    return CSR_MostRegs_RegMask;
   switch (Subtarget.getTargetABI()) {
   default:
     llvm_unreachable("Unrecognized ABI");
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index ede5477f04bd..f548a8dd0532 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -95,4 +95,20 @@ unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; }
 
 bool LoongArchTTIImpl::enableWritePrefetching() const { return true; }
 
+bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_reduce_umin:
+    return false; // The listed reductions are left unexpanded.
+  default:
+    return true; // Every other intrinsic ID requests expansion.
+  }
+}
+
 // TODO: Implement more hooks to provide TTI machinery for LoongArch.
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
index d43d2cb0eb12..e3f16c780499 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
@@ -53,6 +53,8 @@ public:
   unsigned getPrefetchDistance() const override;
   bool enableWritePrefetching() const override;
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const override;
+
   // TODO: Implement more hooks to provide TTI machinery for LoongArch.
 };
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index 35277ce094a7..e5bd1c91edec 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -26,6 +26,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Compiler.h"
+#include <bitset>
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -95,10 +96,81 @@ createLoongArchAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
 namespace {
 
 class LoongArchMCInstrAnalysis : public MCInstrAnalysis {
+  int64_t GPRState[31] = {};    // Tracked values, indexed by getRegIndex().
+  std::bitset<31> GPRValidMask; // Bit I set => GPRState[I] is valid.
+
+  static bool isGPR(MCRegister Reg) {
+    return !(Reg < LoongArch::R0 || Reg > LoongArch::R31);
+  }
+
+  static unsigned getRegIndex(MCRegister Reg) { // Slot of Reg in GPRState.
+    assert(isGPR(Reg) && Reg != LoongArch::R0 && "Invalid GPR reg");
+    return Reg - LoongArch::R1; // R1 maps to index 0; R0 has no slot.
+  }
+
+  /// Record \p Value as the known content of \p Reg, or mark \p Reg unknown
+  /// when \p Value is empty. Writes to R0 are ignored.
+  void setGPRState(MCRegister Reg, std::optional<int64_t> Value) {
+    if (Reg == LoongArch::R0)
+      return;
+    const unsigned Slot = getRegIndex(Reg);
+    if (!Value.has_value()) {
+      GPRValidMask.reset(Slot);
+      return;
+    }
+    GPRState[Slot] = *Value;
+    GPRValidMask.set(Slot);
+  }
+
+  /// Return the tracked value of \p Reg, if known. R0 always reads as zero.
+  std::optional<int64_t> getGPRState(MCRegister Reg) const {
+    if (Reg == LoongArch::R0)
+      return 0;
+
+    const unsigned Slot = getRegIndex(Reg);
+    if (!GPRValidMask.test(Slot))
+      return std::nullopt;
+    return GPRState[Slot];
+  }
+
 public:
   explicit LoongArchMCInstrAnalysis(const MCInstrInfo *Info)
       : MCInstrAnalysis(Info) {}
 
+  void resetState() override { GPRValidMask.reset(); }
+
+  /// Update the tracked GPR values for \p Inst located at address \p Addr.
+  /// Only PCADDU18I produces a known value; other definitions are forgotten.
+  void updateState(const MCInst &Inst, uint64_t Addr) override {
+    // A terminator ends the basic block, so whatever follows sequentially
+    // starts a new block where the tracked values generally no longer hold.
+    // Calls are treated as clobbering every register (TODO: should we take
+    // the calling convention into account?).
+    if (isTerminator(Inst) || isCall(Inst)) {
+      resetState();
+      return;
+    }
+
+    const unsigned Opcode = Inst.getOpcode();
+    if (Opcode == LoongArch::PCADDU18I) {
+      // The result is Addr plus the immediate shifted left by 18 and
+      // sign-extended from 38 bits.
+      const uint64_t Shifted =
+          static_cast<uint64_t>(Inst.getOperand(1).getImm()) << 18;
+      setGPRState(Inst.getOperand(0).getReg(),
+                  Addr + SignExtend64<38>(Shifted));
+      return;
+    }
+
+    // Any other instruction is not modelled: forget every GPR it defines.
+    const unsigned NumDefs = Info->get(Opcode).getNumDefs();
+    for (unsigned I = 0; I != NumDefs; ++I) {
+      auto DefReg = Inst.getOperand(I).getReg();
+      if (isGPR(DefReg))
+        setGPRState(DefReg, std::nullopt);
+    }
+  }
+
   bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
                       uint64_t &Target) const override {
     unsigned NumOps = Inst.getNumOperands();
@@ -108,6 +180,14 @@ public:
       return true;
     }
 
+    if (Inst.getOpcode() == LoongArch::JIRL) {
+      if (auto TargetRegState = getGPRState(Inst.getOperand(1).getReg())) {
+        Target = *TargetRegState + Inst.getOperand(2).getImm();
+        return true;
+      }
+      return false;
+    }
+
     return false;
   }
 
diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
index d3ad65390143..4992f1abe5a0 100644
--- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
+++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -107,6 +107,18 @@ static DecodeStatus DecodeFPCSCRegisterClass(MCInst &Inst, uint64_t RegNo,
 }
 #define DecodeFPICRegisterClass DecodeFPCSCRegisterClass
 
+static DecodeStatus DecodeCCRCRegisterClass(MCInst &Inst,
+                                            const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createReg(M68k::CCR));
+  return DecodeStatus::Success;
+}
+
+static DecodeStatus DecodeSRCRegisterClass(MCInst &Inst,
+                                           const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createReg(M68k::SR));
+  return DecodeStatus::Success;
+}
+
 static DecodeStatus DecodeImm32(MCInst &Inst, uint64_t Imm, uint64_t Address,
                                 const void *Decoder) {
   Inst.addOperand(MCOperand::createImm(M68k::swapWord<uint32_t>(Imm)));
diff --git a/llvm/lib/Target/M68k/M68kInstrAtomics.td b/llvm/lib/Target/M68k/M68kInstrAtomics.td
index 867afbefe68f..b2b64ca85322 100644
--- a/llvm/lib/Target/M68k/M68kInstrAtomics.td
+++ b/llvm/lib/Target/M68k/M68kInstrAtomics.td
@@ -67,7 +67,8 @@ class MxCASARIDOp<bits<2> size_encoding, MxType type>
              "cas."#type.Prefix#" $dc, $du, $mem"> {
   let Inst = (ascend
                 (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_p<"mem">.EA),
-                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+                MxEncAddrMode_p<"mem">.Supplement
               );
   let Constraints = "$out = $dc";
   let mayLoad = 1;
@@ -84,7 +85,8 @@ class MxCASARIIOp<bits<2> size_encoding, MxType type>
              "cas."#type.Prefix#" $dc, $du, $mem"> {
   let Inst = (ascend
                 (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_f<"mem">.EA),
-                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+                MxEncAddrMode_f<"mem">.Supplement
               );
   let Constraints = "$out = $dc";
   let mayLoad = 1;
@@ -100,8 +102,9 @@ class MxCASALOp<bits<2> size_encoding, MxType type>
              (ins type.ROp:$dc, type.ROp:$du, !cast<MxMemOp>("MxAL"#type.Size):$mem),
              "cas."#type.Prefix#" $dc, $du, $mem"> {
   let Inst = (ascend
-                (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem">.EA),
-                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3))
+                (descend 0b00001, size_encoding, 0b011, MxEncAddrMode_abs<"mem", true>.EA),
+                (descend 0b0000000, (operand "$du", 3), 0b000, (operand "$dc", 3)),
+                MxEncAddrMode_abs<"mem", true>.Supplement
               );
   let Constraints = "$out = $dc";
   let mayLoad = 1;
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index 21e9319aaf0b..c6be190bd124 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
 void M68kInstrInfo::anchor() {}
 
 M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI)
-    : M68kGenInstrInfo(M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
+    : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
                        M68k::RET),
       Subtarget(STI), RI(STI) {}
 
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 38d22eda5f17..a31c8ec1b2bb 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -36,7 +36,6 @@ namespace {
 
 /// Parses MSP430 assembly from a stream.
 class MSP430AsmParser : public MCTargetAsmParser {
-  const MCSubtargetInfo &STI;
   MCAsmParser &Parser;
   const MCRegisterInfo *MRI;
 
@@ -79,7 +78,7 @@ class MSP430AsmParser : public MCTargetAsmParser {
 public:
   MSP430AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
                   const MCInstrInfo &MII, const MCTargetOptions &Options)
-      : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+      : MCTargetAsmParser(Options, STI, MII), Parser(Parser) {
     MCAsmParserExtension::Initialize(Parser);
     MRI = getContext().getRegisterInfo();
 
@@ -264,7 +263,7 @@ bool MSP430AsmParser::matchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
   switch (MatchResult) {
   case Match_Success:
     Inst.setLoc(Loc);
-    Out.emitInstruction(Inst, STI);
+    Out.emitInstruction(Inst, *STI);
     return false;
   case Match_MnemonicFail:
     return Error(Loc, "invalid instruction mnemonic");
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index c8094a8eeb36..e6666e8cafdf 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -103,15 +103,6 @@ static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo,
 }
 
 static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-#include "MSP430GenDisassemblerTables.inc"
-
-static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
                                 const MCDisassembler *Decoder) {
   int64_t Imm;
   switch (Bits) {
@@ -142,6 +133,8 @@ static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
   return MCDisassembler::Success;
 }
 
+#include "MSP430GenDisassemblerTables.inc"
+
 enum AddrMode {
   amInvalid = 0,
   amRegister,
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 6da5e66be4ad..5653099431b1 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -154,9 +154,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
       const RTLIB::LibcallImpl Impl;
     } LibraryCalls[] = {
         // Integer Multiply - EABI Table 9
-        {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw},
-        {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw},
-        {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw},
+        {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw},
+        {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw},
+        {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw},
         // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
         // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
     };
@@ -169,9 +169,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
       const RTLIB::LibcallImpl Impl;
     } LibraryCalls[] = {
         // Integer Multiply - EABI Table 9
-        {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_hw},
-        {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_hw32},
-        {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_hw32},
+        {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_hw},
+        {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_hw32},
+        {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_hw32},
         // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
         // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
     };
@@ -184,9 +184,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
       const RTLIB::LibcallImpl Impl;
     } LibraryCalls[] = {
         // Integer Multiply - EABI Table 9
-        {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi_f5hw},
-        {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl_f5hw},
-        {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll_f5hw},
+        {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi_f5hw},
+        {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl_f5hw},
+        {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll_f5hw},
         // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
         // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
     };
@@ -199,9 +199,9 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
       const RTLIB::LibcallImpl Impl;
     } LibraryCalls[] = {
         // Integer Multiply - EABI Table 9
-        {RTLIB::MUL_I16, RTLIB::__mspabi_mpyi},
-        {RTLIB::MUL_I32, RTLIB::__mspabi_mpyl},
-        {RTLIB::MUL_I64, RTLIB::__mspabi_mpyll},
+        {RTLIB::MUL_I16, RTLIB::impl___mspabi_mpyi},
+        {RTLIB::MUL_I32, RTLIB::impl___mspabi_mpyl},
+        {RTLIB::MUL_I64, RTLIB::impl___mspabi_mpyll},
         // The __mspabi_mpysl* functions are NOT implemented in libgcc
         // The __mspabi_mpyul* functions are NOT implemented in libgcc
     };
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 8bc6387e6a7e..65b4820752c9 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "MSP430InstrInfo.h"
 #include "MSP430.h"
+#include "MSP430Subtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -24,9 +25,9 @@ using namespace llvm;
 // Pin the vtable to this file.
 void MSP430InstrInfo::anchor() {}
 
-MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
-  : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
-    RI() {}
+MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI)
+    : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+      RI() {}
 
 void MSP430InstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
index 58be64336f26..316c136890bf 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -27,7 +27,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo {
   const MSP430RegisterInfo RI;
   virtual void anchor();
 public:
-  explicit MSP430InstrInfo(MSP430Subtarget &STI);
+  explicit MSP430InstrInfo(const MSP430Subtarget &STI);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Mips/CMakeLists.txt b/llvm/lib/Target/Mips/CMakeLists.txt
index 21d1765107ae..4a2277e9a80d 100644
--- a/llvm/lib/Target/Mips/CMakeLists.txt
+++ b/llvm/lib/Target/Mips/CMakeLists.txt
@@ -6,7 +6,8 @@ tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler
+              -ignore-non-decodable-operands)
 tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM MipsGenGlobalISel.inc -gen-global-isel)
 tablegen(LLVM MipsGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 0c98c4da2ede..fa6cc0e3f018 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -78,451 +78,216 @@ public:
 
 } // end anonymous namespace
 
-// Forward declare these because the autogenerated code will reference them.
-// Definitions are further down.
-static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                               const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePtrRegisterClass(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
+static MCDisassembler *createMipsDisassembler(const Target &T,
+                                              const MCSubtargetInfo &STI,
+                                              MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, true);
+}
 
-static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
+static MCDisassembler *createMipselDisassembler(const Target &T,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, false);
+}
 
-static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
+                                         createMipselDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
+                                         createMipselDisassembler);
+}
 
-static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn,
+static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
+  const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
+  return RegInfo->getRegClass(RC).getRegister(RegNo);
+}
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               uint64_t Address,
-                                              const MCDisassembler *Decoder);
+                                              const MCDisassembler *Decoder) {
+  // Currently only hardware register 29 is supported.
+  if (RegNo != 29)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(Mips::HWR29));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
                                               uint64_t Address,
-                                              const MCDisassembler *Decoder);
+                                              const MCDisassembler *Decoder) {
+  if (RegNo > 30 || RegNo % 2)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo,
                                                 uint64_t Address,
-                                                const MCDisassembler *Decoder);
+                                                const MCDisassembler *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
-                                               const MCDisassembler *Decoder);
+                                               const MCDisassembler *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
-                                            const MCDisassembler *Decoder);
+                                            const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
-                                            const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst, unsigned Offset,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeJumpTarget(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget21(MCInst &Inst, unsigned Offset,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst, unsigned Offset,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBranchTarget26(MCInst &Inst, unsigned Offset,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst, unsigned Offset,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst, unsigned Offset,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, unsigned Offset,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-// DecodeJumpTargetMM - Decode microMIPS jump target, which is
-// shifted left by 1 bit.
-static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target,
-// which is shifted left by 2 bit.
-static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm4(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst, unsigned Value,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLi16Imm(MCInst &Inst, unsigned Value,
-                                  uint64_t Address,
-                                  const MCDisassembler *Decoder);
-
-static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, unsigned Value,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
+                                            const MCDisassembler *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
 
-template <unsigned Bits, int Offset, int Scale>
-static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
-                                                 uint64_t Address,
-                                                 const MCDisassembler *Decoder);
+  unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
 
-template <unsigned Bits, int Offset>
-static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
                                          uint64_t Address,
                                          const MCDisassembler *Decoder) {
-  return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
-                                                       Decoder);
-}
-
-template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
-static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
-                                                 uint64_t Address,
-                                                 const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-
-/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
-/// handle.
-template <typename InsnType>
-static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
-                                   const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4,
+                     Mips::S5, Mips::S6, Mips::S7, Mips::FP};
+  unsigned RegNum;
 
-template <typename InsnType>
-static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
+  unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
 
-template <typename InsnType>
-static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
+  // Empty register lists are not allowed.
+  if (RegLst == 0)
+    return MCDisassembler::Fail;
 
-template <typename InsnType>
-static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
+  RegNum = RegLst & 0xf;
 
-template <typename InsnType>
-static DecodeStatus DecodeDINS(MCInst &MI, InsnType Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
+  // RegLst values 10-15, and 26-31 are reserved.
+  if (RegNum > 9)
+    return MCDisassembler::Fail;
 
-template <typename InsnType>
-static DecodeStatus DecodeDEXT(MCInst &MI, InsnType Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
+  for (unsigned i = 0; i < RegNum; i++)
+    Inst.addOperand(MCOperand::createReg(Regs[i]));
 
-template <typename InsnType>
-static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
+  if (RegLst & 0x10)
+    Inst.addOperand(MCOperand::createReg(Mips::RA));
 
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
                                            uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
-                                       uint64_t Address,
-                                       const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static MCDisassembler *createMipsDisassembler(
-                       const Target &T,
-                       const MCSubtargetInfo &STI,
-                       MCContext &Ctx) {
-  return new MipsDisassembler(STI, Ctx, true);
-}
-
-static MCDisassembler *createMipselDisassembler(
-                       const Target &T,
-                       const MCSubtargetInfo &STI,
-                       MCContext &Ctx) {
-  return new MipsDisassembler(STI, Ctx, false);
-}
+                                           const MCDisassembler *Decoder) {
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
+  unsigned RegLst;
+  switch (Inst.getOpcode()) {
+  default:
+    RegLst = fieldFromInstruction(Insn, 4, 2);
+    break;
+  case Mips::LWM16_MMR6:
+  case Mips::SWM16_MMR6:
+    RegLst = fieldFromInstruction(Insn, 8, 2);
+    break;
+  }
+  unsigned RegNum = RegLst & 0x3;
 
-extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
-LLVMInitializeMipsDisassembler() {
-  // Register the disassembler.
-  TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
-                                         createMipsDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
-                                         createMipselDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
-                                         createMipsDisassembler);
-  TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
-                                         createMipselDisassembler);
-}
+  for (unsigned i = 0; i <= RegNum; i++)
+    Inst.addOperand(MCOperand::createReg(Regs[i]));
 
-#include "MipsGenDisassemblerTables.inc"
+  Inst.addOperand(MCOperand::createReg(Mips::RA));
 
-static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
-  const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
-  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+  return MCDisassembler::Success;
 }
 
 template <typename InsnType>
@@ -1095,247 +860,15 @@ static DecodeStatus DecodeCRC(MCInst &MI, InsnType Insn, uint64_t Address,
                               const MCDisassembler *Decoder) {
   InsnType Rs = fieldFromInstruction(Insn, 21, 5);
   InsnType Rt = fieldFromInstruction(Insn, 16, 5);
-  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
-                                     Rt)));
-  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
-                                     Rs)));
-  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
-                                     Rt)));
-  return MCDisassembler::Success;
-}
-
-/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
-/// according to the given endianness.
-static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
-                                      uint64_t &Size, uint32_t &Insn,
-                                      bool IsBigEndian) {
-  // We want to read exactly 2 Bytes of data.
-  if (Bytes.size() < 2) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  if (IsBigEndian) {
-    Insn = (Bytes[0] << 8) | Bytes[1];
-  } else {
-    Insn = (Bytes[1] << 8) | Bytes[0];
-  }
-
-  return MCDisassembler::Success;
-}
-
-/// Read four bytes from the ArrayRef and return 32 bit word sorted
-/// according to the given endianness.
-static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
-                                      uint64_t &Size, uint32_t &Insn,
-                                      bool IsBigEndian, bool IsMicroMips) {
-  // We want to read exactly 4 Bytes of data.
-  if (Bytes.size() < 4) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
-  // always precede the low 16 bits in the instruction stream (that is, they
-  // are placed at lower addresses in the instruction stream).
-  //
-  // microMIPS byte ordering:
-  //   Big-endian:    0 | 1 | 2 | 3
-  //   Little-endian: 1 | 0 | 3 | 2
-
-  if (IsBigEndian) {
-    // Encoded as a big-endian 32-bit word in the stream.
-    Insn =
-        (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
-  } else {
-    if (IsMicroMips) {
-      Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
-             (Bytes[1] << 24);
-    } else {
-      Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
-             (Bytes[3] << 24);
-    }
-  }
-
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+  MI.addOperand(
+      MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
   return MCDisassembler::Success;
 }
 
-DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
-                                              ArrayRef<uint8_t> Bytes,
-                                              uint64_t Address,
-                                              raw_ostream &CStream) const {
-  uint32_t Insn;
-  DecodeStatus Result;
-  Size = 0;
-
-  if (IsMicroMips) {
-    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
-    if (Result == MCDisassembler::Fail)
-      return MCDisassembler::Fail;
-
-    if (hasMips32r6()) {
-      LLVM_DEBUG(
-          dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
-      // Calling the auto-generated decoder function for microMIPS32R6
-      // 16-bit instructions.
-      Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
-                                 Address, this, STI);
-      if (Result != MCDisassembler::Fail) {
-        Size = 2;
-        return Result;
-      }
-    }
-
-    LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
-    // Calling the auto-generated decoder function for microMIPS 16-bit
-    // instructions.
-    Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
-                               this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 2;
-      return Result;
-    }
-
-    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
-    if (Result == MCDisassembler::Fail)
-      return MCDisassembler::Fail;
-
-    if (hasMips32r6()) {
-      LLVM_DEBUG(
-          dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
-      // Calling the auto-generated decoder function.
-      Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn,
-                                 Address, this, STI);
-      if (Result != MCDisassembler::Fail) {
-        Size = 4;
-        return Result;
-      }
-    }
-
-    LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
-    // Calling the auto-generated decoder function.
-    Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
-                               this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
-      return Result;
-    }
-
-    if (isFP64()) {
-      LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
-      Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
-                                 Address, this, STI);
-      if (Result != MCDisassembler::Fail) {
-        Size = 4;
-        return Result;
-      }
-    }
-
-    // This is an invalid instruction. Claim that the Size is 2 bytes. Since
-    // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
-    // could form a valid instruction. The two bytes we rejected as an
-    // instruction could have actually beeen an inline constant pool that is
-    // unconditionally branched over.
-    Size = 2;
-    return MCDisassembler::Fail;
-  }
-
-  // Attempt to read the instruction so that we can attempt to decode it. If
-  // the buffer is not 4 bytes long, let the higher level logic figure out
-  // what to do with a size of zero and MCDisassembler::Fail.
-  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
-  if (Result == MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
-  // The only instruction size for standard encoded MIPS.
-  Size = 4;
-
-  if (hasCOP3()) {
-    LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
-    Result =
-        decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasMips32r6() && isGP64()) {
-    LLVM_DEBUG(
-        dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasMips32r6() && isPTR64()) {
-    LLVM_DEBUG(
-        dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasMips32r6()) {
-    LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasMips2() && isPTR64()) {
-    LLVM_DEBUG(
-        dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasCnMips()) {
-    LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (hasCnMipsP()) {
-    LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (isGP64()) {
-    LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  if (isFP64()) {
-    LLVM_DEBUG(
-        dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn,
-                               Address, this, STI);
-    if (Result != MCDisassembler::Fail)
-      return Result;
-  }
-
-  LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
-  // Calling the auto-generated decoder function.
-  Result =
-      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail)
-    return Result;
-
-  return MCDisassembler::Fail;
-}
-
 static DecodeStatus
 DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
                              const MCDisassembler *Decoder) {
@@ -1971,137 +1504,6 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder) {
-  // Currently only hardware register 29 is supported.
-  if (RegNo != 29)
-    return  MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::createReg(Mips::HWR29));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder) {
-  if (RegNo > 30 || RegNo %2)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder) {
-  if (RegNo >= 4)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo >= 4)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo >= 4)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  if (RegNo > 7)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const MCDisassembler *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
-  Inst.addOperand(MCOperand::createReg(Reg));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset,
                                        uint64_t Address,
                                        const MCDisassembler *Decoder) {
@@ -2241,7 +1643,7 @@ DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
   return MCDisassembler::Success;
 }
 
-template <unsigned Bits, int Offset, int ScaleBy>
+template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
 static DecodeStatus
 DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
                              const MCDisassembler *Decoder) {
@@ -2250,6 +1652,14 @@ DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value, uint64_t Address,
   return MCDisassembler::Success;
 }
 
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+                                         uint64_t Address,
+                                         const MCDisassembler *Decoder) {
+  return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
+                                                       Decoder);
+}
+
 static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address,
                                   const MCDisassembler *Decoder) {
   // First we need to grab the pos(lsb) from MCInst.
@@ -2294,90 +1704,12 @@ static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
                                     const MCDisassembler *Decoder) {
   // Insn must be >= 0, since it is unsigned that condition is always true.
   assert(Insn < 16);
-  int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64,
-                             255, 32768, 65535};
+  int32_t DecodedValues[] = {128, 1,  2,  3,  4,  7,   8,     15,
+                             16,  31, 32, 63, 64, 255, 32768, 65535};
   Inst.addOperand(MCOperand::createImm(DecodedValues[Insn]));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder) {
-  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
-                     Mips::S6, Mips::S7, Mips::FP};
-  unsigned RegNum;
-
-  unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
-
-  // Empty register lists are not allowed.
-  if (RegLst == 0)
-    return MCDisassembler::Fail;
-
-  RegNum = RegLst & 0xf;
-
-  // RegLst values 10-15, and 26-31 are reserved.
-  if (RegNum > 9)
-    return MCDisassembler::Fail;
-
-  for (unsigned i = 0; i < RegNum; i++)
-    Inst.addOperand(MCOperand::createReg(Regs[i]));
-
-  if (RegLst & 0x10)
-    Inst.addOperand(MCOperand::createReg(Mips::RA));
-
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder) {
-  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
-  unsigned RegLst;
-  switch(Inst.getOpcode()) {
-  default:
-    RegLst = fieldFromInstruction(Insn, 4, 2);
-    break;
-  case Mips::LWM16_MMR6:
-  case Mips::SWM16_MMR6:
-    RegLst = fieldFromInstruction(Insn, 8, 2);
-    break;
-  }
-  unsigned RegNum = RegLst & 0x3;
-
-  for (unsigned i = 0; i <= RegNum; i++)
-    Inst.addOperand(MCOperand::createReg(Regs[i]));
-
-  Inst.addOperand(MCOperand::createReg(Mips::RA));
-
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder) {
-  unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
-  if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
-  unsigned RegRs;
-  if (static_cast<const MipsDisassembler*>(Decoder)->hasMips32r6())
-    RegRs = fieldFromInstruction(Insn, 0, 2) |
-            (fieldFromInstruction(Insn, 3, 1) << 2);
-  else
-    RegRs = fieldFromInstruction(Insn, 1, 3);
-  if (DecodeGPRMM16MovePRegisterClass(Inst, RegRs, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
-  unsigned RegRt = fieldFromInstruction(Insn, 4, 3);
-  if (DecodeGPRMM16MovePRegisterClass(Inst, RegRt, Address, Decoder) ==
-      MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
                                        uint64_t Address,
                                        const MCDisassembler *Decoder) {
@@ -2421,6 +1753,32 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeMovePOperands(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address,
+                                        const MCDisassembler *Decoder) {
+  unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
+  if (DecodeMovePRegPair(Inst, RegPair, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  unsigned RegRs;
+  if (static_cast<const MipsDisassembler *>(Decoder)->hasMips32r6())
+    RegRs = fieldFromInstruction(Insn, 0, 2) |
+            (fieldFromInstruction(Insn, 3, 1) << 2);
+  else
+    RegRs = fieldFromInstruction(Insn, 1, 3);
+  if (DecodeGPRMM16MovePRegisterClass(Inst, RegRs, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  unsigned RegRt = fieldFromInstruction(Insn, 4, 3);
+  if (DecodeGPRMM16MovePRegisterClass(Inst, RegRt, Address, Decoder) ==
+      MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
                                      uint64_t Address,
                                      const MCDisassembler *Decoder) {
@@ -2528,3 +1886,237 @@ static DecodeStatus DecodeFIXMEInstruction(MCInst &Inst, unsigned Insn,
                                            const MCDisassembler *Decoder) {
   return MCDisassembler::Fail;
 }
+
+#include "MipsGenDisassemblerTables.inc"
+
+/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
+/// according to the given endianness.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian) {
+  // We want to read exactly 2 Bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  if (IsBigEndian) {
+    Insn = (Bytes[0] << 8) | Bytes[1];
+  } else {
+    Insn = (Bytes[1] << 8) | Bytes[0];
+  }
+
+  return MCDisassembler::Success;
+}
+
+/// Read four bytes from the ArrayRef and return 32 bit word sorted
+/// according to the given endianness.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian, bool IsMicroMips) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
+  // always precede the low 16 bits in the instruction stream (that is, they
+  // are placed at lower addresses in the instruction stream).
+  //
+  // microMIPS byte ordering:
+  //   Big-endian:    0 | 1 | 2 | 3
+  //   Little-endian: 1 | 0 | 3 | 2
+
+  if (IsBigEndian) {
+    // Encoded as a big-endian 32-bit word in the stream.
+    Insn =
+        (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
+  } else {
+    if (IsMicroMips) {
+      Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
+             (Bytes[1] << 24);
+    } else {
+      Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
+             (Bytes[3] << 24);
+    }
+  }
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &CStream) const {
+  uint32_t Insn;
+  DecodeStatus Result;
+  Size = 0;
+
+  if (IsMicroMips) {
+    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      LLVM_DEBUG(
+          dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+      // Calling the auto-generated decoder function for microMIPS32R6
+      // 16-bit instructions.
+      Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 2;
+        return Result;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+    // Calling the auto-generated decoder function for microMIPS 16-bit
+    // instructions.
+    Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 2;
+      return Result;
+    }
+
+    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      LLVM_DEBUG(
+          dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
+      // Calling the auto-generated decoder function.
+      Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+
+    if (isFP64()) {
+      LLVM_DEBUG(dbgs() << "Trying MicroMipsFP64 table (32-bit opcodes):\n");
+      Result = decodeInstruction(DecoderTableMicroMipsFP6432, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    // This is an invalid instruction. Claim that the Size is 2 bytes. Since
+    // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
+    // could form a valid instruction. The two bytes we rejected as an
+    // instruction could have actually been an inline constant pool that is
+    // unconditionally branched over.
+    Size = 2;
+    return MCDisassembler::Fail;
+  }
+
+  // Attempt to read the instruction so that we can attempt to decode it. If
+  // the buffer is not 4 bytes long, let the higher level logic figure out
+  // what to do with a size of zero and MCDisassembler::Fail.
+  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // The only instruction size for standard encoded MIPS.
+  Size = 4;
+
+  if (hasCOP3()) {
+    LLVM_DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+    Result =
+        decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6() && isGP64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6() && isPTR64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips32r6()) {
+    LLVM_DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasMips2() && isPTR64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasCnMips()) {
+    LLVM_DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (hasCnMipsP()) {
+    LLVM_DEBUG(dbgs() << "Trying CnMipsP table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMipsP32, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (isGP64()) {
+    LLVM_DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips6432, Instr, Insn, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  if (isFP64()) {
+    LLVM_DEBUG(
+        dbgs() << "Trying MipsFP64 (64 bit FPU) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMipsFP6432, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  LLVM_DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+  // Calling the auto-generated decoder function.
+  Result =
+      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail)
+    return Result;
+
+  return MCDisassembler::Fail;
+}
diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
index 330cb4e0e206..7bd96b571bc6 100644
--- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -56,48 +56,52 @@ struct Mips16IntrinsicHelperType{
 
 // Libcalls for which no helper is generated. Sorted by name for binary search.
 static const Mips16Libcall HardFloatLibCalls[] = {
-    {RTLIB::ADD_F64, RTLIB::__mips16_adddf3, "__mips16_adddf3"},
-    {RTLIB::ADD_F32, RTLIB::__mips16_addsf3, "__mips16_addsf3"},
-    {RTLIB::DIV_F64, RTLIB::__mips16_divdf3, "__mips16_divdf3"},
-    {RTLIB::DIV_F32, RTLIB::__mips16_divsf3, "__mips16_divsf3"},
-    {RTLIB::OEQ_F64, RTLIB::__mips16_eqdf2, "__mips16_eqdf2"},
-    {RTLIB::OEQ_F32, RTLIB::__mips16_eqsf2, "__mips16_eqsf2"},
-    {RTLIB::FPEXT_F32_F64, RTLIB::__mips16_extendsfdf2, "__mips16_extendsfdf2"},
-    {RTLIB::FPTOSINT_F64_I32, RTLIB::__mips16_fix_truncdfsi,
+    {RTLIB::ADD_F64, RTLIB::impl___mips16_adddf3, "__mips16_adddf3"},
+    {RTLIB::ADD_F32, RTLIB::impl___mips16_addsf3, "__mips16_addsf3"},
+    {RTLIB::DIV_F64, RTLIB::impl___mips16_divdf3, "__mips16_divdf3"},
+    {RTLIB::DIV_F32, RTLIB::impl___mips16_divsf3, "__mips16_divsf3"},
+    {RTLIB::OEQ_F64, RTLIB::impl___mips16_eqdf2, "__mips16_eqdf2"},
+    {RTLIB::OEQ_F32, RTLIB::impl___mips16_eqsf2, "__mips16_eqsf2"},
+    {RTLIB::FPEXT_F32_F64, RTLIB::impl___mips16_extendsfdf2,
+     "__mips16_extendsfdf2"},
+    {RTLIB::FPTOSINT_F64_I32, RTLIB::impl___mips16_fix_truncdfsi,
      "__mips16_fix_truncdfsi"},
-    {RTLIB::FPTOSINT_F32_I32, RTLIB::__mips16_fix_truncsfsi,
+    {RTLIB::FPTOSINT_F32_I32, RTLIB::impl___mips16_fix_truncsfsi,
      "__mips16_fix_truncsfsi"},
-    {RTLIB::SINTTOFP_I32_F64, RTLIB::__mips16_floatsidf, "__mips16_floatsidf"},
-    {RTLIB::SINTTOFP_I32_F32, RTLIB::__mips16_floatsisf, "__mips16_floatsisf"},
-    {RTLIB::UINTTOFP_I32_F64, RTLIB::__mips16_floatunsidf,
+    {RTLIB::SINTTOFP_I32_F64, RTLIB::impl___mips16_floatsidf,
+     "__mips16_floatsidf"},
+    {RTLIB::SINTTOFP_I32_F32, RTLIB::impl___mips16_floatsisf,
+     "__mips16_floatsisf"},
+    {RTLIB::UINTTOFP_I32_F64, RTLIB::impl___mips16_floatunsidf,
      "__mips16_floatunsidf"},
-    {RTLIB::UINTTOFP_I32_F32, RTLIB::__mips16_floatunsisf,
+    {RTLIB::UINTTOFP_I32_F32, RTLIB::impl___mips16_floatunsisf,
      "__mips16_floatunsisf"},
-    {RTLIB::OGE_F64, RTLIB::__mips16_gedf2, "__mips16_gedf2"},
-    {RTLIB::OGE_F32, RTLIB::__mips16_gesf2, "__mips16_gesf2"},
-    {RTLIB::OGT_F64, RTLIB::__mips16_gtdf2, "__mips16_gtdf2"},
-    {RTLIB::OGT_F32, RTLIB::__mips16_gtsf2, "__mips16_gtsf2"},
-    {RTLIB::OLE_F64, RTLIB::__mips16_ledf2, "__mips16_ledf2"},
-    {RTLIB::OLE_F32, RTLIB::__mips16_lesf2, "__mips16_lesf2"},
-    {RTLIB::OLT_F64, RTLIB::__mips16_ltdf2, "__mips16_ltdf2"},
-    {RTLIB::OLT_F32, RTLIB::__mips16_ltsf2, "__mips16_ltsf2"},
-    {RTLIB::MUL_F64, RTLIB::__mips16_muldf3, "__mips16_muldf3"},
-    {RTLIB::MUL_F32, RTLIB::__mips16_mulsf3, "__mips16_mulsf3"},
-    {RTLIB::UNE_F64, RTLIB::__mips16_nedf2, "__mips16_nedf2"},
-    {RTLIB::UNE_F32, RTLIB::__mips16_nesf2, "__mips16_nesf2"},
-    {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_dc,
+    {RTLIB::OGE_F64, RTLIB::impl___mips16_gedf2, "__mips16_gedf2"},
+    {RTLIB::OGE_F32, RTLIB::impl___mips16_gesf2, "__mips16_gesf2"},
+    {RTLIB::OGT_F64, RTLIB::impl___mips16_gtdf2, "__mips16_gtdf2"},
+    {RTLIB::OGT_F32, RTLIB::impl___mips16_gtsf2, "__mips16_gtsf2"},
+    {RTLIB::OLE_F64, RTLIB::impl___mips16_ledf2, "__mips16_ledf2"},
+    {RTLIB::OLE_F32, RTLIB::impl___mips16_lesf2, "__mips16_lesf2"},
+    {RTLIB::OLT_F64, RTLIB::impl___mips16_ltdf2, "__mips16_ltdf2"},
+    {RTLIB::OLT_F32, RTLIB::impl___mips16_ltsf2, "__mips16_ltsf2"},
+    {RTLIB::MUL_F64, RTLIB::impl___mips16_muldf3, "__mips16_muldf3"},
+    {RTLIB::MUL_F32, RTLIB::impl___mips16_mulsf3, "__mips16_mulsf3"},
+    {RTLIB::UNE_F64, RTLIB::impl___mips16_nedf2, "__mips16_nedf2"},
+    {RTLIB::UNE_F32, RTLIB::impl___mips16_nesf2, "__mips16_nesf2"},
+    {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_dc,
      "__mips16_ret_dc"}, // No associated libcall.
-    {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_df,
+    {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_df,
      "__mips16_ret_df"}, // No associated libcall.
-    {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sc,
+    {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sc,
      "__mips16_ret_sc"}, // No associated libcall.
-    {RTLIB::UNKNOWN_LIBCALL, RTLIB::__mips16_ret_sf,
+    {RTLIB::UNKNOWN_LIBCALL, RTLIB::impl___mips16_ret_sf,
      "__mips16_ret_sf"}, // No associated libcall.
-    {RTLIB::SUB_F64, RTLIB::__mips16_subdf3, "__mips16_subdf3"},
-    {RTLIB::SUB_F32, RTLIB::__mips16_subsf3, "__mips16_subsf3"},
-    {RTLIB::FPROUND_F64_F32, RTLIB::__mips16_truncdfsf2, "__mips16_truncdfsf2"},
-    {RTLIB::UO_F64, RTLIB::__mips16_unorddf2, "__mips16_unorddf2"},
-    {RTLIB::UO_F32, RTLIB::__mips16_unordsf2, "__mips16_unordsf2"}};
+    {RTLIB::SUB_F64, RTLIB::impl___mips16_subdf3, "__mips16_subdf3"},
+    {RTLIB::SUB_F32, RTLIB::impl___mips16_subsf3, "__mips16_subsf3"},
+    {RTLIB::FPROUND_F64_F32, RTLIB::impl___mips16_truncdfsf2,
+     "__mips16_truncdfsf2"},
+    {RTLIB::UO_F64, RTLIB::impl___mips16_unorddf2, "__mips16_unorddf2"},
+    {RTLIB::UO_F32, RTLIB::impl___mips16_unordsf2, "__mips16_unordsf2"}};
 
 static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
   {"__fixunsdfsi", "__mips16_call_stub_2" },
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td
index fb2a83dc90ea..ab473c133b8e 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -374,8 +374,8 @@ class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
 
 class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
                       string asmstr, InstrItinClass itin>:
-  FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rs),
-              !strconcat(asmstr, "\t$rs"), [], itin> ;
+  FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+              !strconcat(asmstr, "\t$rx"), [], itin> ;
 
 class FRR_SF16_ins
   <bits<5> _funct, bits<3> _subfunc,
@@ -776,7 +776,6 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
 }
 
 def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
-  let rx = 0b000;
   let isBranch = 1;
   let isIndirectBranch = 1;
   let isTerminator=1;
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index ae91c97e2a80..9d8b9f86daf7 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -967,8 +967,7 @@ void MipsAsmPrinter::EmitFPCallStub(
   // freed) and since we're at the global level we can use the default
   // constructed subtarget.
   std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
-      TM.getTargetTriple().str(), TM.getTargetCPU(),
-      TM.getTargetFeatureString()));
+      TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
 
   //
   // .global xxxx
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 8a59532ba578..bffdffa4af6a 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -40,7 +40,7 @@ using namespace llvm;
 void MipsInstrInfo::anchor() {}
 
 MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
-    : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+    : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
       Subtarget(STI), UncondBrOpc(UncondBr) {}
 
 const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index ee1ca4538554..f9bdc0993533 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -290,7 +290,8 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum,
       O << ".acq_rel";
       return;
     case NVPTX::Ordering::SequentiallyConsistent:
-      O << ".seq_cst";
+      report_fatal_error(
+          "NVPTX AtomicCode Printer does not support \"seq_cst\" ordering.");
       return;
     case NVPTX::Ordering::Volatile:
       O << ".volatile";
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 8a445f82e700..31c117a8c0fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -80,9 +80,9 @@ class FeaturePTX<int version>:
 //  + Compare within the family by comparing FullSMVersion, given both belongs to
 //    the same family.
 //  + Detect 'a' variants by checking FullSMVersion & 1.
-foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
-              60, 61, 62, 70, 72, 75, 80, 86, 87,
-              89, 90, 100, 101, 103, 120, 121] in {
+foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60,
+              61, 62, 70, 72, 75, 80, 86, 87, 88, 89,
+              90, 100, 101, 103, 110, 120, 121] in {
   // Base SM version (e.g. FullSMVersion for sm_100 is 1000)
   def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
 
@@ -127,6 +127,7 @@ def : Proc<"sm_75",   [SM75, PTX63]>;
 def : Proc<"sm_80",   [SM80, PTX70]>;
 def : Proc<"sm_86",   [SM86, PTX71]>;
 def : Proc<"sm_87",   [SM87, PTX74]>;
+def : Proc<"sm_88",   [SM88, PTX90]>;
 def : Proc<"sm_89",   [SM89, PTX78]>;
 def : Proc<"sm_90",   [SM90, PTX78]>;
 def : Proc<"sm_90a",  [SM90a, PTX80]>;
@@ -139,6 +140,9 @@ def : Proc<"sm_101f", [SM101f, PTX88]>;
 def : Proc<"sm_103",  [SM103, PTX88]>;
 def : Proc<"sm_103a", [SM103a, PTX88]>;
 def : Proc<"sm_103f", [SM103f, PTX88]>;
+def : Proc<"sm_110",  [SM110, PTX90]>;
+def : Proc<"sm_110a", [SM110a, PTX90]>;
+def : Proc<"sm_110f", [SM110f, PTX90]>;
 def : Proc<"sm_120",  [SM120, PTX87]>;
 def : Proc<"sm_120a", [SM120a, PTX87]>;
 def : Proc<"sm_120f", [SM120f, PTX88]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 7391c2d488b5..14ca867023e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -432,7 +432,7 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
   // .maxclusterrank directive requires SM_90 or higher, make sure that we
   // filter it out for lower SM versions, as it causes a hard ptxas crash.
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+  const NVPTXSubtarget *STI = &NTM.getSubtarget<NVPTXSubtarget>(F);
 
   if (STI->getSmVersion() >= 90) {
     const auto ClusterDim = getClusterDim(F);
@@ -669,7 +669,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
   // rest of NVPTX isn't friendly to change subtargets per function and
   // so the default TargetMachine will have all of the options.
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const auto* STI = static_cast<const NVPTXSubtarget*>(NTM.getSubtargetImpl());
+  const NVPTXSubtarget *STI = NTM.getSubtargetImpl();
   SmallString<128> Str1;
   raw_svector_ostream OS1(Str1);
 
@@ -680,8 +680,7 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
 
 bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const NVPTXSubtarget &STI =
-      *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+  const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
   if (M.alias_size() && (STI.getPTXVersion() < 63 || STI.getSmVersion() < 30))
     report_fatal_error(".alias requires PTX version >= 6.3 and sm_30");
 
@@ -716,8 +715,7 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) {
   assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
 
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const NVPTXSubtarget &STI =
-      *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+  const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
 
   // Print out module-level global variables in proper order
   for (const GlobalVariable *GV : Globals)
@@ -1178,8 +1176,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *F, raw_ostream &O) {
   ArrayRef<const GlobalVariable *> GVars = It->second;
 
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const NVPTXSubtarget &STI =
-      *static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+  const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
 
   for (const GlobalVariable *GV : GVars) {
     O << "\t// demoted variable\n\t";
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 3300ed9a5a81..c70f48af33cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -170,6 +170,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     }
     break;
   }
+  case NVPTXISD::ATOMIC_CMP_SWAP_B128:
+  case NVPTXISD::ATOMIC_SWAP_B128:
+    selectAtomicSwap128(N);
+    return;
   case ISD::FADD:
   case ISD::FMUL:
   case ISD::FSUB:
@@ -1097,11 +1101,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
-  const EVT LoadedEVT = LD->getMemoryVT();
-  if (!LoadedEVT.isSimple())
-    return false;
-  const MVT LoadedVT = LoadedEVT.getSimpleVT();
-
   // Address Space Setting
   const auto CodeAddrSpace = getAddrSpace(LD);
   if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
@@ -1111,7 +1110,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
-  const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
+  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
 
   // Vector Setting
   const unsigned FromType =
@@ -1165,9 +1164,6 @@ static unsigned getStoreVectorNumElts(SDNode *N) {
 
 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   MemSDNode *LD = cast<MemSDNode>(N);
-  const EVT MemEVT = LD->getMemoryVT();
-  if (!MemEVT.isSimple())
-    return false;
 
   // Address Space Setting
   const auto CodeAddrSpace = getAddrSpace(LD);
@@ -1237,10 +1233,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
 }
 
 bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
-  const EVT LoadedEVT = LD->getMemoryVT();
-  if (!LoadedEVT.isSimple())
-    return false;
-
   SDLoc DL(LD);
 
   unsigned ExtensionType;
@@ -1357,10 +1349,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (PlainStore && PlainStore->isIndexed())
     return false;
 
-  const EVT StoreVT = ST->getMemoryVT();
-  if (!StoreVT.isSimple())
-    return false;
-
   // Address Space Setting
   const auto CodeAddrSpace = getAddrSpace(ST);
 
@@ -1369,7 +1357,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
-  const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
+  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
 
   // Create the machine instruction DAG
   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
@@ -1406,8 +1394,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
 
 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   MemSDNode *ST = cast<MemSDNode>(N);
-  const EVT StoreVT = ST->getMemoryVT();
-  assert(StoreVT.isSimple() && "Store value is not simple");
+  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
 
   // Address Space Setting
   const auto CodeAddrSpace = getAddrSpace(ST);
@@ -1420,10 +1407,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   SDValue Chain = ST->getChain();
   const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
-  // Type Setting: toType + toTypeWidth
-  // - for integer type, always use 'u'
-  const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
-
   const unsigned NumElts = getStoreVectorNumElts(ST);
 
   SmallVector<SDValue, 16> Ops;
@@ -2337,3 +2320,30 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   }
   }
 }
+
+// Select a 128-bit atomic swap / compare-and-swap into ATOM_{EXCH,CAS}_B128.
+void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
+  auto *MemN = cast<MemSDNode>(N);
+  const unsigned NodeOpc = N->getOpcode();
+  SDLoc Loc(N);
+
+  // Machine operands: address pair, every node operand after the chain and
+  // pointer, then ordering, scope, address space, and finally the chain.
+  const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
+  SmallVector<SDValue, 5> MachineOps{Base, Offset};
+  MachineOps.append(N->op_begin() + 2, N->op_end());
+  MachineOps.push_back(getI32Imm(getMemOrder(MemN), Loc));
+  MachineOps.push_back(getI32Imm(getAtomicScope(MemN), Loc));
+  MachineOps.push_back(getI32Imm(getAddrSpace(MemN), Loc));
+  MachineOps.push_back(N->getOperand(0));
+
+  assert(NodeOpc == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
+         NodeOpc == NVPTXISD::ATOMIC_SWAP_B128);
+  unsigned MachineOpc = NVPTX::ATOM_CAS_B128;
+  if (NodeOpc == NVPTXISD::ATOMIC_SWAP_B128)
+    MachineOpc = NVPTX::ATOM_EXCH_B128;
+  MachineSDNode *Sel =
+      CurDAG->getMachineNode(MachineOpc, Loc, N->getVTList(), MachineOps);
+  CurDAG->setNodeMemRefs(Sel, MemN->getMemOperand());
+  ReplaceNode(N, Sel);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index e2ad55bc1796..8dcd5362c451 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -90,6 +90,7 @@ private:
                                            bool IsIm2Col = false);
   void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
   void SelectTcgen05St(SDNode *N, bool hasOffset = false);
+  void selectAtomicSwap128(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4bb1195f78..d3fb657851fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -198,6 +198,12 @@ static bool IsPTXVectorType(MVT VT) {
 static std::optional<std::pair<unsigned int, MVT>>
 getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
                        unsigned AddressSpace) {
+  const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
+
+  if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
+      VectorEVT.getSizeInBits() == 256)
+    return {{4, MVT::i64}};
+
   if (!VectorEVT.isSimple())
     return std::nullopt;
   const MVT VectorVT = VectorEVT.getSimpleVT();
@@ -214,8 +220,6 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
   // The size of the PTX virtual register that holds a packed type.
   unsigned PackRegSize;
 
-  bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
-
   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
   // legal.  We can (and should) split that into 2 stores of <2 x double> here
   // but I'm leaving that as a TODO for now.
@@ -539,6 +543,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     case ISD::FMINNUM_IEEE:
     case ISD::FMAXIMUM:
     case ISD::FMINIMUM:
+    case ISD::FMAXIMUMNUM:
+    case ISD::FMINIMUMNUM:
       IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
       break;
     case ISD::FEXP2:
@@ -702,57 +708,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
-  // Turn FP extload into load/fpextend
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
-  // Turn FP truncstore into trunc + store.
-  // FIXME: vector types should also be expanded
-  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
-  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+  // FP extload/truncstore is not legal in PTX. We need to expand all these.
+  for (auto FloatVTs :
+       {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+    for (MVT ValVT : FloatVTs) {
+      for (MVT MemVT : FloatVTs) {
+        setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+        setTruncStoreAction(ValVT, MemVT, Expand);
+      }
+    }
+  }
 
-  // PTX does not support load / store predicate registers
-  setOperationAction(ISD::LOAD, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::i1, Custom);
+  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+  // how they'll be lowered in ISel anyway, and by doing this a little earlier
+  // we allow for more DAG combine opportunities.
+  for (auto IntVTs :
+       {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+    for (MVT ValVT : IntVTs)
+      for (MVT MemVT : IntVTs)
+        if (isTypeLegal(ValVT))
+          setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
 
+  // PTX does not support load / store predicate registers
+  setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
   for (MVT VT : MVT::integer_valuetypes()) {
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+                     Promote);
     setTruncStoreAction(VT, MVT::i1, Expand);
   }
 
+  // Disable generation of extload/truncstore for v2i16/v2i8. The generic
+  // expansion for these nodes when they are unaligned is incorrect if the
+  // type is a vector.
+  //
+  // TODO: Fix the generic expansion for these nodes found in
+  //       TargetLowering::expandUnalignedLoad/Store.
+  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+                   MVT::v2i8, Expand);
+  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+  // Register custom handling for illegal type loads/stores. We'll try to custom
+  // lower almost all illegal types, and the lowering logic will discard the
+  // cases we can't handle.
+  setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+  for (MVT VT : MVT::fixedlen_vector_valuetypes())
+    if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+      setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+  // Custom legalization for LDU intrinsics.
+  // TODO: The logic to lower these is not very robust and we should rewrite it.
+  //       Perhaps LDU should not be represented as an intrinsic at all.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+  for (MVT VT : MVT::fixedlen_vector_valuetypes())
+    if (IsPTXVectorType(VT))
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+
   setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
                      ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
                      ISD::SETGE, ISD::SETLE},
                     MVT::i1, Expand);
 
-  // expand extload of vector of integers.
-  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
-                   MVT::v2i8, Expand);
-  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-
   // This is legal in NVPTX
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +782,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // DEBUGTRAP can be lowered to PTX brkpt
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
-  // Register custom handling for vector loads/stores
-  for (MVT VT : MVT::fixedlen_vector_valuetypes())
-    if (IsPTXVectorType(VT))
-      setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
-                         Custom);
-
-  setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
-                     {MVT::i128, MVT::f128}, Custom);
-
   // Support varargs.
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 
-  // Custom handling for i8 intrinsics
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
   setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
                      {MVT::i16, MVT::i32, MVT::i64}, Legal);
 
@@ -988,7 +991,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
     AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
 
-  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+  for (const auto &Op :
+       {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
     setOperationAction(Op, MVT::f32, Legal);
     setOperationAction(Op, MVT::f64, Legal);
     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
@@ -1039,7 +1043,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
 
   setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
-  // No FPOW or FREM in PTX.
+
+  // atom.b128 is legal in PTX but since we don't represent i128 as a legal
+  // type, we need to custom lower it.
+  setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
+                     Custom);
 
   // Now deduce the information based on the above mentioned
   // actions
@@ -1047,7 +1055,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 
   // PTX support for 16-bit CAS is emulated. Only use 32+
   setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
-  setMaxAtomicSizeInBitsSupported(64);
+  setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
   setMaxDivRemBitWidthSupported(64);
 
   // Custom lowering for tcgen05.ld vector operands
@@ -1080,6 +1088,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case NVPTXISD::FIRST_NUMBER:
     break;
 
+    MAKE_CASE(NVPTXISD::ATOMIC_CMP_SWAP_B128)
+    MAKE_CASE(NVPTXISD::ATOMIC_SWAP_B128)
     MAKE_CASE(NVPTXISD::RET_GLUE)
     MAKE_CASE(NVPTXISD::DeclareArrayParam)
     MAKE_CASE(NVPTXISD::DeclareScalarParam)
@@ -3088,29 +3098,112 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       MachinePointerInfo(SV));
 }
 
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
-                              SmallVectorImpl<SDValue> &Results,
-                              const NVPTXSubtarget &STI);
+/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
+static std::optional<std::pair<SDValue, SDValue>>
+replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  const EVT ResVT = LD->getValueType(0);
+  const EVT MemVT = LD->getMemoryVT();
 
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getValueType() == MVT::i1)
-    return LowerLOADi1(Op, DAG);
+  // If we're doing sign/zero extension as part of the load, avoid lowering to
+  // a LoadV node. TODO: consider relaxing this restriction.
+  if (ResVT != MemVT)
+    return std::nullopt;
 
-  EVT VT = Op.getValueType();
+  const auto NumEltsAndEltVT =
+      getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
+  if (!NumEltsAndEltVT)
+    return std::nullopt;
+  const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
+
+  Align Alignment = LD->getAlign();
+  const auto &TD = DAG.getDataLayout();
+  Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
+  if (Alignment < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return std::nullopt;
+  }
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propagate the "real" type as the memory type.
+  const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
+
+  unsigned Opcode;
+  switch (NumElts) {
+  default:
+    return std::nullopt;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    break;
+  case 4:
+    Opcode = NVPTXISD::LoadV4;
+    break;
+  case 8:
+    Opcode = NVPTXISD::LoadV8;
+    break;
+  }
+  auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
+  ListVTs.push_back(MVT::Other);
+  SDVTList LdResVTs = DAG.getVTList(ListVTs);
 
-  if (NVPTX::isPackedVectorTy(VT)) {
-    // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
-    // handle unaligned loads and have to handle it here.
-    LoadSDNode *Load = cast<LoadSDNode>(Op);
-    EVT MemVT = Load->getMemoryVT();
-    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                        MemVT, *Load->getMemOperand())) {
-      SDValue Ops[2];
-      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
-      return DAG.getMergeValues(Ops, SDLoc(Op));
+  SDLoc DL(LD);
+
+  // Copy regular operands
+  SmallVector<SDValue, 8> OtherOps(LD->ops());
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension information
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
+                                          LD->getMemOperand());
+
+  SmallVector<SDValue> ScalarRes;
+  if (EltVT.isVector()) {
+    assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
+    assert(NumElts * EltVT.getVectorNumElements() ==
+           ResVT.getVectorNumElements());
+    // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+    // into individual elements.
+    for (const unsigned I : llvm::seq(NumElts)) {
+      SDValue SubVector = NewLD.getValue(I);
+      DAG.ExtractVectorElements(SubVector, ScalarRes);
+    }
+  } else {
+    for (const unsigned I : llvm::seq(NumElts)) {
+      SDValue Res = NewLD.getValue(I);
+      if (LoadEltVT != EltVT)
+        Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
+      ScalarRes.push_back(Res);
     }
   }
 
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  const MVT BuildVecVT =
+      MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
+  SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
+  SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
+
+  return {{LoadValue, LoadChain}};
+}
+
+static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &Results,
+                              const NVPTXSubtarget &STI) {
+  if (auto Res = replaceLoadVector(N, DAG, STI))
+    Results.append({Res->first, Res->second});
+}
+
+static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG,
+                               const NVPTXSubtarget &STI) {
+  if (auto Res = replaceLoadVector(N, DAG, STI))
+    return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
   return SDValue();
 }
 
@@ -3118,13 +3211,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 //   =>
 // v1 = ld i8* addr (-> i16)
 // v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
-  SDNode *Node = Op.getNode();
-  LoadSDNode *LD = cast<LoadSDNode>(Node);
-  SDLoc dl(Node);
+static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDLoc dl(LD);
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
-  assert(Node->getValueType(0) == MVT::i1 &&
-         "Custom lowering for i1 load only");
+  assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
   SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
                                  LD->getBasePtr(), LD->getPointerInfo(),
                                  MVT::i8, LD->getAlign(),
@@ -3133,35 +3223,31 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   // The legalizer (the caller) is expecting two values from the legalized
   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
   // in LegalizeDAG.cpp which also uses MergeValues.
-  SDValue Ops[] = { result, LD->getChain() };
-  return DAG.getMergeValues(Ops, dl);
+  return DAG.getMergeValues({result, LD->getChain()}, dl);
 }
 
-SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  EVT VT = Store->getMemoryVT();
-
-  if (VT == MVT::i1)
-    return LowerSTOREi1(Op, DAG);
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
 
-  // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
-  // handle unaligned stores and have to handle it here.
-  if (NVPTX::isPackedVectorTy(VT) &&
-      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                      VT, *Store->getMemOperand()))
-    return expandUnalignedStore(Store, DAG);
+  if (Op.getValueType() == MVT::i1)
+    return lowerLOADi1(LD, DAG);
 
-  // v2f16/v2bf16/v2i16 don't need special handling.
-  if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
-    return SDValue();
+  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+  // how they'll be lowered in ISel anyway, and by doing this a little earlier
+  // we allow for more DAG combine opportunities.
+  if (LD->getExtensionType() == ISD::EXTLOAD) {
+    assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+           "Unexpected fpext-load");
+    return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+                          LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+                          LD->getMemOperand());
+  }
 
-  // Lower store of any other vector type, including v2f32 as we want to break
-  // it apart since this is not a widely-supported type.
-  return LowerSTOREVector(Op, DAG);
+  llvm_unreachable("Unexpected custom lowering for load");
 }
 
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
+                                const NVPTXSubtarget &STI) {
   MemSDNode *N = cast<MemSDNode>(Op.getNode());
   SDValue Val = N->getOperand(1);
   SDLoc DL(N);
@@ -3253,6 +3339,18 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   return NewSt;
 }
 
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  if (VT == MVT::i1)
+    return LowerSTOREi1(Op, DAG);
+
+  // Lower store of any other vector type, including v2f32 as we want to break
+  // it apart since this is not a widely-supported type.
+  return lowerSTOREVector(Op, DAG, STI);
+}
+
 // st i1 v, addr
 //    =>
 // v1 = zxt v to i16
@@ -4010,14 +4108,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_p: {
-    auto &DL = I.getDataLayout();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
-      Info.memVT = getValueType(DL, I.getType());
-    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
-      Info.memVT = getPointerTy(DL);
-    else
-      Info.memVT = getValueType(DL, I.getType());
+    Info.memVT = getValueType(I.getDataLayout(), I.getType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.flags = MachineMemOperand::MOLoad;
@@ -5152,11 +5244,34 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
                                      ST->getMemoryVT(), ST->getMemOperand());
 }
 
-static SDValue PerformStoreCombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                            const NVPTXSubtarget &STI) {
+
+  if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
+    // Here is our chance to custom lower a store with a non-simple type.
+    // Unfortunately, we can't do this in the legalizer because there is no
+    // way to setOperationAction for a non-simple type.
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    if (!ST->getValue().getValueType().isSimple())
+      return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
+  }
+
   return combinePackingMovIntoStore(N, DCI, 1, 2);
 }
 
+static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                           const NVPTXSubtarget &STI) {
+  if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
+    // Here is our chance to custom lower a load with a non-simple type.
+    // Unfortunately, we can't do this in the legalizer because there is no
+    // way to setOperationAction for a non-simple type.
+    if (!N->getValueType(0).isSimple())
+      return lowerLoadVector(N, DCI.DAG, STI);
+  }
+
+  return combineUnpackingMovIntoLoad(N, DCI);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
@@ -5884,7 +5999,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::LOAD:
   case NVPTXISD::LoadV2:
   case NVPTXISD::LoadV4:
-    return combineUnpackingMovIntoLoad(N, DCI);
+    return combineLOAD(N, DCI, STI);
   case ISD::MUL:
     return PerformMULCombine(N, DCI, OptLevel);
   case NVPTXISD::PRMT:
@@ -5901,7 +6016,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::STORE:
   case NVPTXISD::StoreV2:
   case NVPTXISD::StoreV4:
-    return PerformStoreCombine(N, DCI);
+    return combineSTORE(N, DCI, STI);
   case ISD::VSELECT:
     return PerformVSELECTCombine(N, DCI);
   }
@@ -5930,103 +6045,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
 }
 
-/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
-static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
-                              SmallVectorImpl<SDValue> &Results,
-                              const NVPTXSubtarget &STI) {
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-  const EVT ResVT = LD->getValueType(0);
-  const EVT MemVT = LD->getMemoryVT();
-
-  // If we're doing sign/zero extension as part of the load, avoid lowering to
-  // a LoadV node. TODO: consider relaxing this restriction.
-  if (ResVT != MemVT)
-    return;
-
-  const auto NumEltsAndEltVT =
-      getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
-  if (!NumEltsAndEltVT)
-    return;
-  const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
-
-  Align Alignment = LD->getAlign();
-  const auto &TD = DAG.getDataLayout();
-  Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
-  if (Alignment < PrefAlign) {
-    // This load is not sufficiently aligned, so bail out and let this vector
-    // load be scalarized.  Note that we may still be able to emit smaller
-    // vector loads.  For example, if we are loading a <4 x float> with an
-    // alignment of 8, this check will fail but the legalizer will try again
-    // with 2 x <2 x float>, which will succeed with an alignment of 8.
-    return;
-  }
-
-  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
-  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
-  // loaded type to i16 and propagate the "real" type as the memory type.
-  const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
-
-  unsigned Opcode;
-  switch (NumElts) {
-  default:
-    return;
-  case 2:
-    Opcode = NVPTXISD::LoadV2;
-    break;
-  case 4:
-    Opcode = NVPTXISD::LoadV4;
-    break;
-  case 8:
-    Opcode = NVPTXISD::LoadV8;
-    break;
-  }
-  auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
-  ListVTs.push_back(MVT::Other);
-  SDVTList LdResVTs = DAG.getVTList(ListVTs);
-
-  SDLoc DL(LD);
-
-  // Copy regular operands
-  SmallVector<SDValue, 8> OtherOps(LD->ops());
-
-  // The select routine does not have access to the LoadSDNode instance, so
-  // pass along the extension information
-  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
-  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
-                                          LD->getMemoryVT(),
-                                          LD->getMemOperand());
-
-  SmallVector<SDValue> ScalarRes;
-  if (EltVT.isVector()) {
-    assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType());
-    assert(NumElts * EltVT.getVectorNumElements() ==
-           ResVT.getVectorNumElements());
-    // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
-    // into individual elements.
-    for (const unsigned I : llvm::seq(NumElts)) {
-      SDValue SubVector = NewLD.getValue(I);
-      DAG.ExtractVectorElements(SubVector, ScalarRes);
-    }
-  } else {
-    for (const unsigned I : llvm::seq(NumElts)) {
-      SDValue Res = NewLD.getValue(I);
-      if (LoadEltVT != EltVT)
-        Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
-      ScalarRes.push_back(Res);
-    }
-  }
-
-  SDValue LoadChain = NewLD.getValue(NumElts);
-
-  const MVT BuildVecVT =
-      MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
-  SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
-  SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
-
-  Results.append({LoadValue, LoadChain});
-}
-
 // Lower vector return type of tcgen05.ld intrinsics
 static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results,
@@ -6262,6 +6280,49 @@ static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
   Results.push_back(Res);
 }
 
+static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
+                                 const NVPTXSubtarget &STI,
+                                 SmallVectorImpl<SDValue> &Results) {
+  assert(N->getValueType(0) == MVT::i128 &&
+         "Custom lowering for atomic128 only supports i128");
+
+  AtomicSDNode *AN = cast<AtomicSDNode>(N);
+  SDLoc dl(N);
+
+  if (!STI.hasAtomSwap128()) {
+    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+        DAG.getMachineFunction().getFunction(),
+        "Support for b128 atomics introduced in PTX ISA version 8.3 and "
+        "requires target sm_90.",
+        dl.getDebugLoc()));
+
+    Results.push_back(DAG.getUNDEF(MVT::i128));
+    Results.push_back(AN->getOperand(0)); // Chain
+    return;
+  }
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(AN->getOperand(0)); // Chain
+  Ops.push_back(AN->getOperand(1)); // Ptr
+  for (const auto &Op : AN->ops().drop_front(2)) {
+    // Low part
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+                              DAG.getIntPtrConstant(0, dl)));
+    // High part
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
+                              DAG.getIntPtrConstant(1, dl)));
+  }
+  unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
+                        ? NVPTXISD::ATOMIC_SWAP_B128
+                        : NVPTXISD::ATOMIC_CMP_SWAP_B128;
+  SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+  SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
+                                           AN->getMemOperand());
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
+                                {Result.getValue(0), Result.getValue(1)}));
+  Results.push_back(Result.getValue(2));
+}
+
 void NVPTXTargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
@@ -6282,6 +6343,10 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   case NVPTXISD::ProxyReg:
     replaceProxyReg(N, DAG, *this, Results);
     return;
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_SWAP:
+    replaceAtomicSwap128(N, DAG, STI, Results);
+    return;
   }
 }
 
@@ -6306,16 +6371,19 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 
   assert(Ty->isIntegerTy() && "Ty should be integer at this point");
-  auto ITy = cast<llvm::IntegerType>(Ty);
+  const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
 
   switch (AI->getOperation()) {
   default:
     return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::BinOp::Xchg:
+    if (BitWidth == 128)
+      return AtomicExpansionKind::None;
+    LLVM_FALLTHROUGH;
   case AtomicRMWInst::BinOp::And:
   case AtomicRMWInst::BinOp::Or:
   case AtomicRMWInst::BinOp::Xor:
-  case AtomicRMWInst::BinOp::Xchg:
-    switch (ITy->getBitWidth()) {
+    switch (BitWidth) {
     case 8:
     case 16:
       return AtomicExpansionKind::CmpXChg;
@@ -6325,6 +6393,8 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
       if (STI.hasAtomBitwise64())
         return AtomicExpansionKind::None;
       return AtomicExpansionKind::CmpXChg;
+    case 128:
+      return AtomicExpansionKind::CmpXChg;
     default:
       llvm_unreachable("unsupported width encountered");
     }
@@ -6334,7 +6404,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   case AtomicRMWInst::BinOp::Min:
   case AtomicRMWInst::BinOp::UMax:
   case AtomicRMWInst::BinOp::UMin:
-    switch (ITy->getBitWidth()) {
+    switch (BitWidth) {
     case 8:
     case 16:
       return AtomicExpansionKind::CmpXChg;
@@ -6344,17 +6414,20 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
       if (STI.hasAtomMinMax64())
         return AtomicExpansionKind::None;
       return AtomicExpansionKind::CmpXChg;
+    case 128:
+      return AtomicExpansionKind::CmpXChg;
     default:
       llvm_unreachable("unsupported width encountered");
     }
   case AtomicRMWInst::BinOp::UIncWrap:
   case AtomicRMWInst::BinOp::UDecWrap:
-    switch (ITy->getBitWidth()) {
+    switch (BitWidth) {
     case 32:
       return AtomicExpansionKind::None;
     case 8:
     case 16:
     case 64:
+    case 128:
       return AtomicExpansionKind::CmpXChg;
     default:
       llvm_unreachable("unsupported width encountered");
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 27f099e22097..03b3edc902e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -81,7 +81,17 @@ enum NodeType : unsigned {
   CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z,
 
   FIRST_MEMORY_OPCODE,
-  LoadV2 = FIRST_MEMORY_OPCODE,
+
+  /// These nodes are used to lower atomic instructions with i128 type. They are
+  /// similar to the generic nodes, but the input and output values are split
+  /// into two 64-bit values.
+  /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP_B128(INCHAIN, ptr, cmpLo, cmpHi,
+  ///                                               swapLo, swapHi)
+  /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP_B128(INCHAIN, ptr, amtLo, amtHi)
+  ATOMIC_CMP_SWAP_B128 = FIRST_MEMORY_OPCODE,
+  ATOMIC_SWAP_B128,
+
+  LoadV2,
   LoadV4,
   LoadV8,
   LDUV2, // LDU.v2
@@ -309,11 +319,8 @@ private:
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
-
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 34fe467c9456..6840c7ae8faf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "NVPTXInstrInfo.h"
 #include "NVPTX.h"
+#include "NVPTXSubtarget.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -24,7 +25,8 @@ using namespace llvm;
 // Pin the vtable to this file.
 void NVPTXInstrInfo::anchor() {}
 
-NVPTXInstrInfo::NVPTXInstrInfo() : RegInfo() {}
+NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI)
+    : NVPTXGenInstrInfo(STI), RegInfo() {}
 
 void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
@@ -190,4 +192,4 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
   BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB);
   BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
   return 2;
-}
-\ No newline at end of file
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index 4e9dc9d3b468..23889531431e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -21,12 +21,13 @@
 #include "NVPTXGenInstrInfo.inc"
 
 namespace llvm {
+class NVPTXSubtarget;
 
 class NVPTXInstrInfo : public NVPTXGenInstrInfo {
   const NVPTXRegisterInfo RegInfo;
   virtual void anchor();
 public:
-  explicit NVPTXInstrInfo();
+  explicit NVPTXInstrInfo(const NVPTXSubtarget &STI);
 
   const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 7b135098bd4c..4e38e026e6bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -104,6 +104,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasAtomSwap128 : Predicate<"Subtarget->hasAtomSwap128()">;
 def hasClusters : Predicate<"Subtarget->hasClusters()">;
 def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
 def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
@@ -294,7 +295,7 @@ multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
 //
 // Also defines ftz (flush subnormal inputs and results to sign-preserving
 // zero) variants for fp32 functions.
-multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
+multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDPatternOperator OpNode> {
   defvar nan_str = !if(NaN, ".NaN", "");
   if !not(NaN) then {
    def _f64_rr :
@@ -898,10 +899,8 @@ let Predicates = [hasOptEnabled] in {
   defm MAD_LO_S32 : MADInst<"lo.s32", mul, I32RT, I32RT>;
   defm MAD_LO_S64 : MADInst<"lo.s64", mul, I64RT, I64RT>;
 
-  defm MAD_WIDE_U16 : MADInst<"wide.u16", umul_wide, I32RT, I16RT>;
-  defm MAD_WIDE_S16 : MADInst<"wide.s16", smul_wide, I32RT, I16RT>;
-  defm MAD_WIDE_U32 : MADInst<"wide.u32", umul_wide, I64RT, I32RT>;
-  defm MAD_WIDE_S32 : MADInst<"wide.s32", smul_wide, I64RT, I32RT>;
+  // Generating mad.wide causes a regression:
+  // https://github.com/llvm/llvm-project/pull/150477#issuecomment-3191367837
 }
 
 //-----------------------------------
@@ -912,8 +911,15 @@ defm FADD : F3_fma_component<"add", fadd>;
 defm FSUB : F3_fma_component<"sub", fsub>;
 defm FMUL : F3_fma_component<"mul", fmul>;
 
-defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
-defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
+def fminnum_or_fminimumnum : PatFrags<(ops node:$a, node:$b),
+                                     [(fminnum node:$a, node:$b),
+                                      (fminimumnum node:$a, node:$b)]>;
+def fmaxnum_or_fmaximumnum : PatFrags<(ops node:$a, node:$b),
+                                     [(fmaxnum node:$a, node:$b),
+                                      (fmaximumnum node:$a, node:$b)]>;
+
+defm MIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum_or_fminimumnum>;
+defm MAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum_or_fmaximumnum>;
 defm MIN_NAN : FMINIMUMMAXIMUM<"min", /* NaN */ true, fminimum>;
 defm MAX_NAN : FMINIMUMMAXIMUM<"max", /* NaN */ true, fmaximum>;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4ab30a5b5f5e..c544911bdf1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1990,19 +1990,23 @@ multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode a
 
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
     def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst),
-      (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+      (ins ADDR:$addr, t.RC:$b, t.RC:$c),
+      (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
       asm_str>;
 
     def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst),
-      (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+      (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
+      (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
       asm_str>;
 
     def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst),
-      (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+      (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
+      (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
       asm_str>;
 
     def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst),
-      (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+      (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
+      (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
       asm_str>;
   }
 
@@ -2200,6 +2204,37 @@ defm INT_PTX_SATOM_MIN  : ATOM2_minmax_impl<"min">;
 defm INT_PTX_SATOM_OR   : ATOM2_bitwise_impl<"or">;
 defm INT_PTX_SATOM_XOR  : ATOM2_bitwise_impl<"xor">;
 
+// atom.*.b128
+
+let mayLoad = true, mayStore = true, hasSideEffects = true,
+    Predicates = [hasAtomSwap128] in {
+  def ATOM_CAS_B128 :
+    NVPTXInst<
+        (outs B64:$dst0, B64:$dst1),
+        (ins ADDR:$addr, B64:$cmp0, B64:$cmp1, B64:$swap0, B64:$swap1,
+             AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+        "{{\n\t"
+        ".reg .b128 cmp, swap, dst;\n\t"
+        "mov.b128 cmp, {$cmp0, $cmp1};\n\t"
+        "mov.b128 swap, {$swap0, $swap1};\n\t"
+        "atom${sem:sem}${scope:scope}${addsp:addsp}.cas.b128 dst, [$addr], cmp, swap;\n\t"
+        "mov.b128 {$dst0, $dst1}, dst;\n\t"
+        "}}">;
+
+  def ATOM_EXCH_B128 :
+    NVPTXInst<
+        (outs B64:$dst0, B64:$dst1),
+        (ins ADDR:$addr, B64:$amt0, B64:$amt1,
+             AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+        "{{\n\t"
+        ".reg .b128 amt, dst;\n\t"
+        "mov.b128 amt, {$amt0, $amt1};\n\t"
+        "atom${sem:sem}${scope:scope}${addsp:addsp}.exch.b128 dst, [$addr], amt;\n\t"
+        "mov.b128 {$dst0, $dst1}, dst;\n\t"
+        "}}">;
+}
+
+
 //-----------------------------------
 // Support for ldu on sm_20 or later
 //-----------------------------------
@@ -4358,10 +4393,12 @@ let hasSideEffects = 1 in {
   def SREG_CLOCK : PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
   def SREG_CLOCK64 : PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
   def SREG_GLOBALTIMER : PTX_READ_SREG_R64<"globaltimer", int_nvvm_read_ptx_sreg_globaltimer>;
+  def SREG_GLOBALTIMER_LO : PTX_READ_SREG_R32<"globaltimer_lo", int_nvvm_read_ptx_sreg_globaltimer_lo>;
 }
 
 def: Pat <(i64 (readcyclecounter)), (SREG_CLOCK64)>;
 def: Pat <(i64 (readsteadycounter)), (SREG_GLOBALTIMER)>;
+def: Pat <(i32 (readsteadycounter)), (SREG_GLOBALTIMER_LO)>;
 
 def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
 def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index a84ceaba991c..c5489670bd24 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -62,7 +62,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
       FullSmVersion(200), SmVersion(getSmVersion()),
-      TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
   TSInfo = std::make_unique<NVPTXSelectionDAGInfo>();
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index acf025b70ce3..0a77a633cb25 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -82,6 +82,7 @@ public:
   bool hasAtomBitwise64() const { return SmVersion >= 32; }
   bool hasAtomMinMax64() const { return SmVersion >= 32; }
   bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
+  bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
   bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
   bool hasLDG() const { return SmVersion >= 32; }
   bool hasHWROT32() const { return SmVersion >= 32; }
@@ -105,6 +106,7 @@ public:
   // Tcgen05 instructions in Blackwell family
   bool hasTcgen05Instructions() const {
     bool HasTcgen05 = false;
+    unsigned MinPTXVersion = 86;
     switch (FullSmVersion) {
     default:
       break;
@@ -112,9 +114,13 @@ public:
     case 1013: // sm_101a
       HasTcgen05 = true;
       break;
+    case 1033: // sm_103a
+      HasTcgen05 = true;
+      MinPTXVersion = 88;
+      break;
     }
 
-    return HasTcgen05 && PTXVersion >= 86;
+    return HasTcgen05 && PTXVersion >= MinPTXVersion;
   }
   // f32x2 instructions in Blackwell family
   bool hasF32x2Instructions() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 0603994606d7..833f014a4c87 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -126,12 +126,12 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
   // (addrspace:3).
   if (!is64Bit)
     Ret += "-p:32:32-p6:32:32-p7:32:32";
-  else if (UseShortPointers) {
+  else if (UseShortPointers)
     Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
-  } else
+  else
     Ret += "-p6:32:32";
 
-  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
+  Ret += "-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64";
 
   return Ret;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 274b04fdd30b..8e97b422218f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -55,15 +55,6 @@ void clearAnnotationCache(const Module *Mod) {
   AC.Cache.erase(Mod);
 }
 
-static void readIntVecFromMDNode(const MDNode *MetadataNode,
-                                 std::vector<unsigned> &Vec) {
-  for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) {
-    ConstantInt *Val =
-        mdconst::extract<ConstantInt>(MetadataNode->getOperand(i));
-    Vec.push_back(Val->getZExtValue());
-  }
-}
-
 static void cacheAnnotationFromMD(const MDNode *MetadataNode,
                                   key_val_pair_t &retval) {
   auto &AC = getAnnotationCache();
@@ -83,19 +74,8 @@ static void cacheAnnotationFromMD(const MDNode *MetadataNode,
     if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(
             MetadataNode->getOperand(i + 1))) {
       retval[Key].push_back(Val->getZExtValue());
-    } else if (MDNode *VecMd =
-                   dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) {
-      // note: only "grid_constant" annotations support vector MDNodes.
-      // assert: there can only exist one unique key value pair of
-      // the form (string key, MDNode node). Operands of such a node
-      // shall always be unsigned ints.
-      auto [It, Inserted] = retval.try_emplace(Key);
-      if (Inserted) {
-        readIntVecFromMDNode(VecMd, It->second);
-        continue;
-      }
     } else {
-      llvm_unreachable("Value operand not a constant int or an mdnode");
+      llvm_unreachable("Value operand not a constant int");
     }
   }
 }
@@ -179,16 +159,13 @@ static bool globalHasNVVMAnnotation(const Value &V, const std::string &Prop) {
 }
 
 static bool argHasNVVMAnnotation(const Value &Val,
-                                 const std::string &Annotation,
-                                 const bool StartArgIndexAtOne = false) {
+                                 const std::string &Annotation) {
   if (const Argument *Arg = dyn_cast<Argument>(&Val)) {
     const Function *Func = Arg->getParent();
     std::vector<unsigned> Annot;
     if (findAllNVVMAnnotation(Func, Annotation, Annot)) {
-      const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0;
-      if (is_contained(Annot, BaseOffset + Arg->getArgNo())) {
+      if (is_contained(Annot, Arg->getArgNo()))
         return true;
-      }
     }
   }
   return false;
@@ -250,8 +227,7 @@ bool isParamGridConstant(const Argument &Arg) {
   }
 
   // "grid_constant" counts argument indices starting from 1
-  if (argHasNVVMAnnotation(Arg, "grid_constant",
-                           /*StartArgIndexAtOne*/ true))
+  if (Arg.hasAttribute("nvvm.grid_constant"))
     return true;
 
   return false;
diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index 1e39f01fd7aa..2182039e0eef 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -49,7 +49,7 @@ add_llvm_target(PowerPCCodeGen
   PPCTargetTransformInfo.cpp
   PPCTOCRegDeps.cpp
   PPCTLSDynamicCall.cpp
-  PPCVSXCopy.cpp
+  PPCVSXWACCCopy.cpp
   PPCReduceCRLogicals.cpp
   PPCVSXFMAMutate.cpp
   PPCVSXSwapRemoval.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index 124dac458431..a7cd5cde16b4 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -39,7 +39,7 @@ class ModulePass;
   FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM);
   FunctionPass *createPPCTOCRegDepsPass();
   FunctionPass *createPPCEarlyReturnPass();
-  FunctionPass *createPPCVSXCopyPass();
+  FunctionPass *createPPCVSXWACCCopyPass();
   FunctionPass *createPPCVSXFMAMutatePass();
   FunctionPass *createPPCVSXSwapRemovalPass();
   FunctionPass *createPPCReduceCRLogicalsPass();
@@ -64,7 +64,7 @@ class ModulePass;
   void initializePPCLoopInstrFormPrepPass(PassRegistry&);
   void initializePPCTOCRegDepsPass(PassRegistry&);
   void initializePPCEarlyReturnPass(PassRegistry&);
-  void initializePPCVSXCopyPass(PassRegistry&);
+  void initializePPCVSXWACCCopyPass(PassRegistry &);
   void initializePPCVSXFMAMutatePass(PassRegistry&);
   void initializePPCVSXSwapRemovalPass(PassRegistry&);
   void initializePPCReduceCRLogicalsPass(PassRegistry&);
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 2ab2c147be0e..023fd147535e 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -920,10 +920,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
     assert(!Subtarget->isAIXABI() &&
            "AIX does not support patchable function entry!");
-    // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is
-    // handled in PPCLinuxAsmPrinter.
-    if (MAI->isLittleEndian())
-      return;
     const Function &F = MF->getFunction();
     unsigned Num = 0;
     (void)F.getFnAttribute("patchable-function-entry")
@@ -1789,7 +1785,13 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
     // of instructions change.
     // XRAY is only supported on PPC Linux little endian.
-    if (!MAI->isLittleEndian())
+    const Function &F = MF->getFunction();
+    unsigned Num = 0;
+    (void)F.getFnAttribute("patchable-function-entry")
+        .getValueAsString()
+        .getAsInteger(10, Num);
+
+    if (!MAI->isLittleEndian() || Num)
       break;
     MCSymbol *BeginOfSled = OutContext.createTempSymbol();
     MCSymbol *EndOfSled = OutContext.createTempSymbol();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7022e9e9dae9..fa104e4f69d7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1693,6 +1693,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::XXPERM:
     return "PPCISD::XXPERM";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
+  case PPCISD::VSRQ:
+    return "PPCISD::VSRQ";
   case PPCISD::CMPB:            return "PPCISD::CMPB";
   case PPCISD::Hi:              return "PPCISD::Hi";
   case PPCISD::Lo:              return "PPCISD::Lo";
@@ -2696,7 +2698,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
   if (!isa<ConstantSDNode>(N))
     return false;
 
-  Imm = (int64_t)cast<ConstantSDNode>(N)->getSExtValue();
+  Imm = cast<ConstantSDNode>(N)->getSExtValue();
   return isInt<34>(Imm);
 }
 bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
@@ -11274,6 +11276,24 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getMergeValues(RetOps, dl);
   }
 
+  case Intrinsic::ppc_mma_build_dmr: {
+    SmallVector<SDValue, 8> Pairs;
+    SmallVector<SDValue, 8> Chains;
+    for (int i = 1; i < 9; i += 2) {
+      SDValue Hi = Op.getOperand(i);
+      SDValue Lo = Op.getOperand(i + 1);
+      if (Hi->getOpcode() == ISD::LOAD)
+        Chains.push_back(Hi.getValue(1));
+      if (Lo->getOpcode() == ISD::LOAD)
+        Chains.push_back(Lo.getValue(1));
+      Pairs.push_back(
+          DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo}));
+    }
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+    SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG);
+    return DAG.getMergeValues({Value, TF}, dl);
+  }
+
   case Intrinsic::ppc_mma_dmxxextfdmr512: {
     assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
     auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -11610,6 +11630,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
             Op.getOperand(0)),
         0);
   }
+  case Intrinsic::ppc_mma_disassemble_dmr: {
+    return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2),
+                        Op.getOperand(ArgStart + 1), MachinePointerInfo());
+  }
   default:
     break;
   }
@@ -12099,6 +12123,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
   return DAG.getMergeValues({DmrPValue, TF}, dl);
 }
 
+SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+                                         const SDLoc &dl,
+                                         SelectionDAG &DAG) const {
+  SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0],
+                                Pairs[1]),
+             0);
+  SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+  SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1,
+                                Pairs[2], Pairs[3]),
+             0);
+  SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+  SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+
+  return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1,
+                                    {RC, Lo, LoSub, Hi, HiSub}),
+                 0);
+}
+
 SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc dl(Op);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 559d58309692..669430550f4e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -498,6 +498,9 @@ namespace llvm {
     /// SETBCR - The ISA 3.1 (P10) SETBCR instruction.
     SETBCR,
 
+    /// VSRQ - The ISA 3.1 (P10) Vector Shift Right Quadword instruction.
+    VSRQ,
+
     // NOTE: The nodes below may require PC-Rel specific patterns if the
     // address could be PC-Relative. When adding new nodes below, consider
     // whether or not the address can be PC-Relative and add the corresponding
@@ -1345,6 +1348,8 @@ namespace llvm {
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
+    SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+                          const SDLoc &dl, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
                             CallingConv::ID CallConv, bool isVarArg,
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index fd2084398c85..269d30318bca 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1095,8 +1095,7 @@ let hasSideEffects = 0 in {
 defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$RA),
                         (ins g8rc:$RAi, g8rc:$RS, u6imm:$SH, u6imm:$MBE),
                         "rldimi", "$RA, $RS, $SH, $MBE", IIC_IntRotateDI,
-                        []>, isPPC64, RegConstraint<"$RAi = $RA">,
-                        NoEncode<"$RAi">;
+                        []>, isPPC64, RegConstraint<"$RAi = $RA">;
 
 // Rotate instructions.
 defm RLDCL  : MDSForm_1r<30, 8,
@@ -1156,7 +1155,7 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$RA),
                         (ins g8rc:$RAi, g8rc:$RS, u5imm:$SH, u5imm:$MB,
                         u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME",
                         IIC_IntRotate, []>, PPC970_DGroup_Cracked,
-                        RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">;
+                        RegConstraint<"$RAi = $RA">;
 
 let isSelect = 1 in
 def ISEL8   : AForm_4<31, 15,
@@ -1313,21 +1312,18 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
 def LHAU8 : DForm_1<43, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                     (ins (memri $D, $RA):$addr),
                     "lhau $RST, $addr", IIC_LdStLHAU,
-                    []>, RegConstraint<"$addr.reg = $ea_result">,
-                    NoEncode<"$ea_result">;
+                    []>, RegConstraint<"$addr.reg = $ea_result">;
 // NO LWAU!
 
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in
 def LHAUX8 : XForm_1_memOp<31, 375, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                           (ins (memrr $RA, $RB):$addr),
                           "lhaux $RST, $addr", IIC_LdStLHAUX,
-                          []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                          NoEncode<"$ea_result">;
+                          []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 def LWAUX : XForm_1_memOp<31, 373, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                           (ins (memrr $RA, $RB):$addr),
                           "lwaux $RST, $addr", IIC_LdStLHAUX,
-                          []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                          NoEncode<"$ea_result">, isPPC64;
+                          []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64;
 }
 }
 
@@ -1366,34 +1362,28 @@ let mayLoad = 1, hasSideEffects = 0 in {
 def LBZU8 : DForm_1<35, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                     (ins (memri $D, $RA):$addr),
                     "lbzu $RST, $addr", IIC_LdStLoadUpd,
-                    []>, RegConstraint<"$addr.reg = $ea_result">,
-                    NoEncode<"$ea_result">;
+                    []>, RegConstraint<"$addr.reg = $ea_result">;
 def LHZU8 : DForm_1<41, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                     (ins (memri $D, $RA):$addr),
                     "lhzu $RST, $addr", IIC_LdStLoadUpd,
-                    []>, RegConstraint<"$addr.reg = $ea_result">,
-                    NoEncode<"$ea_result">;
+                    []>, RegConstraint<"$addr.reg = $ea_result">;
 def LWZU8 : DForm_1<33, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                     (ins (memri $D, $RA):$addr),
                     "lwzu $RST, $addr", IIC_LdStLoadUpd,
-                    []>, RegConstraint<"$addr.reg = $ea_result">,
-                    NoEncode<"$ea_result">;
+                    []>, RegConstraint<"$addr.reg = $ea_result">;
 
 def LBZUX8 : XForm_1_memOp<31, 119, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                           (ins (memrr $RA, $RB):$addr),
                           "lbzux $RST, $addr", IIC_LdStLoadUpdX,
-                          []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                          NoEncode<"$ea_result">;
+                          []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 def LHZUX8 : XForm_1_memOp<31, 311, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                           (ins (memrr $RA, $RB):$addr),
                           "lhzux $RST, $addr", IIC_LdStLoadUpdX,
-                          []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                          NoEncode<"$ea_result">;
+                          []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                           (ins (memrr $RA, $RB):$addr),
                           "lwzux $RST, $addr", IIC_LdStLoadUpdX,
-                          []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                          NoEncode<"$ea_result">;
+                          []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 }
 }
 } // Interpretation64Bit
@@ -1445,14 +1435,12 @@ let mayLoad = 1, hasSideEffects = 0 in {
 def LDU  : DSForm_1<58, 1, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                     (ins (memrix $D, $RA):$addr),
                     "ldu $RST, $addr", IIC_LdStLDU,
-                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
-                    NoEncode<"$ea_result">;
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64;
 
 def LDUX : XForm_1_memOp<31, 53, (outs g8rc:$RST, ptr_rc_nor0:$ea_result),
                         (ins (memrr $RA, $RB):$addr),
                         "ldux $RST, $addr", IIC_LdStLDUX,
-                        []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                        NoEncode<"$ea_result">, isPPC64;
+                        []>, RegConstraint<"$addr.ptrreg = $ea_result">, isPPC64;
 }
 
 let mayLoad = 1, hasNoSchedulingInfo = 1 in {
@@ -1718,45 +1706,41 @@ let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
 def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
                    "stbu $RST, $addr", IIC_LdStSTU, []>,
-                   RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+                   RegConstraint<"$addr.reg = $ea_res">;
 def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
                    "sthu $RST, $addr", IIC_LdStSTU, []>,
-                   RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+                   RegConstraint<"$addr.reg = $ea_res">;
 def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$RST, (memri $D, $RA):$addr),
                    "stwu $RST, $addr", IIC_LdStSTU, []>,
-                   RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">;
+                   RegConstraint<"$addr.reg = $ea_res">;
 
 def STBUX8: XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$RST, (memrr $RA, $RB):$addr),
                           "stbux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STHUX8: XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$RST, (memrr $RA, $RB):$addr),
                           "sthux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STWUX8: XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$RST, (memrr $RA, $RB):$addr),
                           "stwux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 } // Interpretation64Bit
 
 def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res),
                    (ins g8rc:$RST, (memrix $D, $RA):$addr),
                    "stdu $RST, $addr", IIC_LdStSTU, []>,
-                   RegConstraint<"$addr.reg = $ea_res">, NoEncode<"$ea_res">,
+                   RegConstraint<"$addr.reg = $ea_res">,
                    isPPC64;
 
 def STDUX : XForm_8_memOp<31, 181, (outs ptr_rc_nor0:$ea_res),
                           (ins g8rc:$RST, (memrr $RA, $RB):$addr),
                           "stdux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked, isPPC64;
 }
 
@@ -2000,7 +1984,7 @@ def : Pat<(int_ppc_darnraw), (DARN 2)>;
 
 class X_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
                    InstrItinClass itin, list<dag> pattern>
-  : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB, u1imm:$L),
+  : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$RA, ty:$RB),
                  !strconcat(opc, " $RA, $RB"), itin, pattern>{
    let L = 1;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 79fe12e8e4b4..97d5e2896323 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -261,6 +261,13 @@ def immEQOneV : PatLeaf<(build_vector), [{
     return C->isOne();
   return false;
 }]>;
+
+def VSRVSRO : PatFrag<(ops node:$input, node:$shift), 
+                      (int_ppc_altivec_vsr 
+                        (int_ppc_altivec_vsro node:$input, node:$shift), 
+                        node:$shift), 
+                      [{ return N->getOperand(1).hasOneUse(); }]>;
+
 //===----------------------------------------------------------------------===//
 // Helpers for defining instructions that directly correspond to intrinsics.
 
@@ -1471,13 +1478,13 @@ def VINSERTB : VXForm_1<781, (outs vrrc:$VD),
                         "vinsertb $VD, $VB, $VA", IIC_VecGeneral,
                         [(set v16i8:$VD, (PPCvecinsert v16i8:$VDi, v16i8:$VB,
                                                       imm32SExt16:$VA))]>,
-                        RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+                        RegConstraint<"$VDi = $VD">;
 def VINSERTH : VXForm_1<845, (outs vrrc:$VD),
                         (ins vrrc:$VDi, u4imm:$VA, vrrc:$VB),
                         "vinserth $VD, $VB, $VA", IIC_VecGeneral,
                         [(set v8i16:$VD, (PPCvecinsert v8i16:$VDi, v8i16:$VB,
                                                       imm32SExt16:$VA))]>,
-                        RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+                        RegConstraint<"$VDi = $VD">;
 def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
 def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
 
@@ -1569,7 +1576,7 @@ def VRLWMI : VXForm_1<133, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi),
                       [(set v4i32:$VD,
                          (int_ppc_altivec_vrlwmi v4i32:$VA, v4i32:$VB,
                                                  v4i32:$VDi))]>,
-                      RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+                      RegConstraint<"$VDi = $VD">;
 def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm",
                              [(set v2i64:$VD,
                                  (int_ppc_altivec_vrldnm v2i64:$VA,
@@ -1579,7 +1586,7 @@ def VRLDMI : VXForm_1<197, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VDi),
                       [(set v2i64:$VD,
                          (int_ppc_altivec_vrldmi v2i64:$VA, v2i64:$VB,
                                                  v2i64:$VDi))]>,
-                      RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+                      RegConstraint<"$VDi = $VD">;
 
 // Vector Shift Left/Right
 def VSLV : VX1_VT5_VA5_VB5<1860, "vslv",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index b4b475b470a5..fba1c6609dba 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -18,7 +18,7 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
   bit PPC64 = 0;  // Default value, override with isPPC64
 
   let Namespace = "PPC";
-  let Inst{0-5} = opcode;
+  let Inst{0...5} = opcode;
   let OutOperandList = OOL;
   let InOperandList = IOL;
   let AsmString = asmstr;
@@ -34,7 +34,7 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
   let TSFlags{0}   = PPC970_First;
   let TSFlags{1}   = PPC970_Single;
   let TSFlags{2}   = PPC970_Cracked;
-  let TSFlags{5-3} = PPC970_Unit;
+  let TSFlags{5...3} = PPC970_Unit;
 
   // Indicate that this instruction is of type X-Form Load or Store
   bits<1> XFormMemOp = 0;
@@ -99,8 +99,8 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
   bit PPC64 = 0;  // Default value, override with isPPC64
 
   let Namespace = "PPC";
-  let Inst{0-5} = opcode1;
-  let Inst{32-37} = opcode2;
+  let Inst{0...5} = opcode1;
+  let Inst{32...37} = opcode2;
   let OutOperandList = OOL;
   let InOperandList = IOL;
   let AsmString = asmstr;
@@ -116,7 +116,7 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
   let TSFlags{0}   = PPC970_First;
   let TSFlags{1}   = PPC970_Single;
   let TSFlags{2}   = PPC970_Cracked;
-  let TSFlags{5-3} = PPC970_Unit;
+  let TSFlags{5...3} = PPC970_Unit;
 
   // Fields used for relation models.
   string BaseName = "";
@@ -135,7 +135,7 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
   bits<24> LI;
 
-  let Inst{6-29}  = LI;
+  let Inst{6...29}  = LI;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -148,12 +148,12 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
   bits<14> BD;
 
   bits<5> BI;
-  let BI{0-1} = BIBO{5-6};
-  let BI{2-4} = CR{0-2};
+  let BI{0...1} = BIBO{5...6};
+  let BI{2...4} = CR{0...2};
 
-  let Inst{6-10}  = BIBO{4-0};
-  let Inst{11-15} = BI;
-  let Inst{16-29} = BD;
+  let Inst{6...10}  = BIBO{4...0};
+  let Inst{11...15} = BI;
+  let Inst{16...29} = BD;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -161,8 +161,8 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
 class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
              string asmstr>
   : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
-  let BIBO{4-0} = bo;
-  let BIBO{6-5} = 0;
+  let BIBO{4...0} = bo;
+  let BIBO{6...5} = 0;
   let CR = 0;
 }
 
@@ -171,9 +171,9 @@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
   : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
   bits<14> BD;
 
-  let Inst{6-10}  = bo;
-  let Inst{11-15} = bi;
-  let Inst{16-29} = BD;
+  let Inst{6...10}  = bo;
+  let Inst{11...15} = bi;
+  let Inst{16...29} = BD;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -185,9 +185,9 @@ class BForm_3<bits<6> opcode, bit aa, bit lk,
   bits<5> BI;
   bits<14> BD;
 
-  let Inst{6-10}  = BO;
-  let Inst{11-15} = BI;
-  let Inst{16-29} = BD;
+  let Inst{6...10}  = BO;
+  let Inst{11...15} = BI;
+  let Inst{16...29} = BD;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -200,10 +200,10 @@ class BForm_3_at<bits<6> opcode, bit aa, bit lk,
   bits<5> BI;
   bits<14> BD;
 
-  let Inst{6-8}   = BO{4-2};
-  let Inst{9-10}  = at;
-  let Inst{11-15} = BI;
-  let Inst{16-29} = BD;
+  let Inst{6...8}   = BO{4...2};
+  let Inst{9...10}  = at;
+  let Inst{11...15} = BI;
+  let Inst{16...29} = BD;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -215,9 +215,9 @@ BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
   bits<5> BI;
   bits<14> BD;
 
-  let Inst{6-10}  = bo;
-  let Inst{11-15} = BI;
-  let Inst{16-29} = BD;
+  let Inst{6...10}  = bo;
+  let Inst{11...15} = BI;
+  let Inst{16...29} = BD;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 }
@@ -231,7 +231,7 @@ class SCForm<bits<6> opcode, bits<1> xo1, bits<1> xo2,
 
   let Pattern = pattern;
 
-  let Inst{20-26} = LEV;
+  let Inst{20...26} = LEV;
   let Inst{30}    = xo1;
   let Inst{31}    = xo2;
 }
@@ -246,9 +246,9 @@ class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
-  let Inst{16-31} = D;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
+  let Inst{16...31} = D;
 }
 
 class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -273,9 +273,9 @@ class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = 0;
-  let Inst{16-31} = D;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = 0;
+  let Inst{16...31} = D;
 }
 
 class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -287,9 +287,9 @@ class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
-  let Inst{16-31} = D;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
+  let Inst{16...31} = D;
 }
 
 class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -321,13 +321,13 @@ class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
   let Pattern = pattern;
   bits<24> LI;
 
-  let Inst{6-29}  = LI;
+  let Inst{6...29}  = LI;
   let Inst{30}    = aa;
   let Inst{31}    = lk;
 
-  let Inst{38-42}  = RST;
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D;
+  let Inst{38...42}  = RST;
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D;
 }
 
 // This is used to emit BL8+NOP.
@@ -349,11 +349,11 @@ class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   bits<5>  RA;
   bits<16> D;
 
-  let Inst{6-8}   = BF;
+  let Inst{6...8}   = BF;
   let Inst{9}     = 0;
   let Inst{10}    = L;
-  let Inst{11-15} = RA;
-  let Inst{16-31} = D;
+  let Inst{11...15} = RA;
+  let Inst{16...31} = D;
 }
 
 class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -383,10 +383,10 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
-  let Inst{16-29} = D;
-  let Inst{30-31} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
+  let Inst{16...29} = D;
+  let Inst{30...31} = xo;
 }
 
 // ISA V3.0B 1.6.6 DX-Form
@@ -398,10 +398,10 @@ class DXForm<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = D{5-1};  // d1
-  let Inst{16-25} = D{15-6}; // d0
-  let Inst{26-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = D{5...1};  // d1
+  let Inst{16...25} = D{15...6}; // d0
+  let Inst{26...30} = xo;
   let Inst{31}    = D{0};    // d2
 }
 
@@ -415,11 +415,11 @@ class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = RA;
-  let Inst{16-27} = DQ;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = RA;
+  let Inst{16...27} = DQ;
   let Inst{28}    = XT{5};
-  let Inst{29-31} = xo;
+  let Inst{29...31} = xo;
 }
 
 class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
@@ -431,10 +431,10 @@ class DQForm_RTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
   bits<12> DQ;
   let Pattern = pattern;
 
-  let Inst{6-10} =  RTp{4-0};
-  let Inst{11-15} = RA;
-  let Inst{16-27} = DQ;
-  let Inst{28-31} = xo;
+  let Inst{6...10} =  RTp{4...0};
+  let Inst{11...15} = RA;
+  let Inst{16...27} = DQ;
+  let Inst{28...31} = xo;
 }
 
 // 1.7.6 X-Form
@@ -449,10 +449,10 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -475,7 +475,7 @@ class XForm_tlbilx<bits<10> xo, dag OOL, dag IOL, string asmstr,
 class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                  InstrItinClass itin>
   : I<opcode, OOL, IOL, asmstr, itin> {
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
 }
 
 // This is the same as XForm_base_r3xo, but the first two operands are swapped
@@ -490,10 +490,10 @@ class XForm_base_r3xo_swapped
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -528,10 +528,10 @@ class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-15} = RA;
+  let Inst{6...10}  = RST;
+  let Inst{11...15} = RA;
   let Inst{20}    = WS;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -570,12 +570,12 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RA;
   bits<5> RB;
 
-  let Inst{6-8}   = BF;
+  let Inst{6...8}   = BF;
   let Inst{9}     = 0;
   let Inst{10}    = L;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -587,10 +587,10 @@ class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RB;
 
   let Inst{6} = 0;
-  let Inst{7-10} = CT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{7...10} = CT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
@@ -600,9 +600,9 @@ class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RS;
   bits<4> SR;
 
-  let Inst{6-10} = RS;
-  let Inst{12-15} = SR;
-  let Inst{21-30} = xo;
+  let Inst{6...10} = RS;
+  let Inst{12...15} = SR;
+  let Inst{21...30} = xo;
 }
 
 class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -610,8 +610,8 @@ class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
          : I<opcode, OOL, IOL, asmstr, itin> {
   bits<5> MO;
 
-  let Inst{6-10} = MO;
-  let Inst{21-30} = xo;
+  let Inst{6...10} = MO;
+  let Inst{21...30} = xo;
 }
 
 class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -620,9 +620,9 @@ class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RS;
   bits<5> RB;
 
-  let Inst{6-10} = RS;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10} = RS;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
 }
 
 class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -631,9 +631,9 @@ class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RS;
   bits<1> L;
 
-  let Inst{6-10} = RS;
+  let Inst{6...10} = RS;
   let Inst{15} = L;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
 }
 
 class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -649,11 +649,11 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RA;
   bits<5> RB;
 
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -673,10 +673,10 @@ class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
   
-  let Inst{6-10}  = FRT;
-  let Inst{11-15} = FRA;
-  let Inst{16-20} = FRB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = FRT;
+  let Inst{11...15} = FRA;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -696,11 +696,11 @@ class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
   
-  let Inst{6-10}  = FRT;
-  let Inst{11-15} = FRA;
-  let Inst{16-20} = FRB;
-  let Inst{21-24} = tttt;
-  let Inst{25-30} = xo;
+  let Inst{6...10}  = FRT;
+  let Inst{11...15} = FRA;
+  let Inst{16...20} = FRB;
+  let Inst{21...24} = tttt;
+  let Inst{25...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -708,10 +708,10 @@ class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern> 
   : I<opcode, OOL, IOL, asmstr, itin> {
   let Pattern = pattern;
-  let Inst{6-10}  = 31;
-  let Inst{11-15} = 0;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = 31;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -721,11 +721,11 @@ class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<2> L;
 
   let Pattern = pattern;
-  let Inst{6-8}   = 0;
-  let Inst{9-10}  = L;
-  let Inst{11-15} = 0;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = 0;
+  let Inst{9...10}  = L;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -736,12 +736,12 @@ class XForm_IMM2_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<2> PL;
 
   let Pattern = pattern;
-  let Inst{6-8}   = 0;
-  let Inst{9-10}  = L;
-  let Inst{11-13} = 0;
-  let Inst{14-15} = PL;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = 0;
+  let Inst{9...10}  = L;
+  let Inst{11...13} = 0;
+  let Inst{14...15} = PL;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -752,12 +752,12 @@ class XForm_IMM3_IMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<2> SC;
 
   let Pattern = pattern;
-  let Inst{6-7}   = 0;
-  let Inst{8-10}  = L;
-  let Inst{11-13} = 0;
-  let Inst{14-15} = SC;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...7}   = 0;
+  let Inst{8...10}  = L;
+  let Inst{11...13} = 0;
+  let Inst{14...15} = SC;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -803,9 +803,9 @@ class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RST;
-  let Inst{11-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
@@ -816,9 +816,9 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = FM;
-  let Inst{11-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = FM;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -828,11 +828,11 @@ class XForm_44<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RT;
   bits<3> BFA;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-13} = BFA;
-  let Inst{14-15} = 0;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...13} = BFA;
+  let Inst{14...15} = 0;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -842,11 +842,11 @@ class XForm_45<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RT;
   bits<2> L;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-13} = 0;
-  let Inst{14-15} = L;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...13} = 0;
+  let Inst{14...15} = L;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -856,11 +856,11 @@ class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo,
   : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
   let Pattern = pattern;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-12} = xo1;
-  let Inst{13-15} = xo2;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...12} = xo1;
+  let Inst{13...15} = xo2;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -871,11 +871,11 @@ class X_FRT5_XO2_XO3_FRB5_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
   let Pattern = pattern;
   bits<5> FRB;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-12} = xo1;
-  let Inst{13-15} = xo2;
-  let Inst{16-20} = FRB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...12} = xo1;
+  let Inst{13...15} = xo2;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -886,12 +886,12 @@ class X_FRT5_XO2_XO3_DRM3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
   let Pattern = pattern;
   bits<3> DRM;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-12} = xo1;
-  let Inst{13-15} = xo2;
-  let Inst{16-17} = 0;
-  let Inst{18-20} = DRM;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...12} = xo1;
+  let Inst{13...15} = xo2;
+  let Inst{16...17} = 0;
+  let Inst{18...20} = DRM;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -902,12 +902,12 @@ class X_FRT5_XO2_XO3_RM2_X10<bits<6> opcode, bits<2> xo1, bits<3> xo2,
   let Pattern = pattern;
   bits<2> RM;
 
-  let Inst{6-10}  = RST;
-  let Inst{11-12} = xo1;
-  let Inst{13-15} = xo2;
-  let Inst{16-18} = 0;
-  let Inst{19-20} = RM;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RST;
+  let Inst{11...12} = xo1;
+  let Inst{13...15} = xo2;
+  let Inst{16...18} = 0;
+  let Inst{19...20} = RM;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -934,10 +934,10 @@ class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   bit RC = 1;
 
-  let Inst{6-9}   = 0;
+  let Inst{6...9}   = 0;
   let Inst{10}    = R;
-  let Inst{11-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -949,8 +949,8 @@ class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bit RC = 1;
 
   let Inst{6}     = A;
-  let Inst{7-20}  = 0;
-  let Inst{21-30} = xo;
+  let Inst{7...20}  = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -961,10 +961,10 @@ class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{7-9}   = 0;
+  let Inst{7...9}   = 0;
   let Inst{10}    = L;
-  let Inst{11-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -975,9 +975,9 @@ class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;
 
-  let Inst{6-8}   = BF;
-  let Inst{9-20}  = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...20}  = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -992,12 +992,12 @@ class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = BF;
+  let Inst{6...8}   = BF;
   let Inst{9}     = 0;
   let Inst{10}    = L;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1011,11 +1011,11 @@ class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1035,10 +1035,10 @@ class X_BF3_DCMX7_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}  = BF;
-  let Inst{9-15} = DCMX;
-  let Inst{16-20} = VB;
-  let Inst{21-30} = xo;
+  let Inst{6...8}  = BF;
+  let Inst{9...15} = DCMX;
+  let Inst{16...20} = VB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1050,10 +1050,10 @@ class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-12} = 0;
-  let Inst{13-20} = IMM8;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...12} = 0;
+  let Inst{13...20} = IMM8;
+  let Inst{21...30} = xo;
   let Inst{31}    = XT{5};
 }
 
@@ -1092,10 +1092,10 @@ class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = XT{5};
 }
 
@@ -1117,10 +1117,10 @@ class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = 0;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = 0;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
 }
@@ -1133,10 +1133,10 @@ class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = CR;
-  let Inst{9-15}  = 0;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...8}   = CR;
+  let Inst{9...15}  = 0;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = 0;
 }
@@ -1150,11 +1150,11 @@ class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-13} = 0;
-  let Inst{14-15} = D;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...13} = 0;
+  let Inst{14...15} = D;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
 }
@@ -1168,10 +1168,10 @@ class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = UIM5;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = UIM5;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
 }
@@ -1185,10 +1185,10 @@ class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = xo2;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = 0;
 }
@@ -1202,10 +1202,10 @@ class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = xo2;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
 }
@@ -1219,10 +1219,10 @@ class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}  = BF;
-  let Inst{9-15} = DCMX;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...8}  = BF;
+  let Inst{9...15} = DCMX;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = 0;
 }
@@ -1237,12 +1237,12 @@ class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = DCMX{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-24} = xo1;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = DCMX{4...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...24} = xo1;
   let Inst{25}    = DCMX{6};
-  let Inst{26-28} = xo2;
+  let Inst{26...28} = xo2;
   let Inst{29}    = DCMX{5};
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
@@ -1257,10 +1257,10 @@ class XForm_XD6_RA5_RB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = D{4-0};  // D
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = D{4...0};  // D
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = D{5};    // DX
 }
 
@@ -1273,11 +1273,11 @@ class XForm_BF3_UIM6_FRB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = BF;
+  let Inst{6...8}   = BF;
   let Inst{9}     = 0;
-  let Inst{10-15} = UIM;
-  let Inst{16-20} = FRB;
-  let Inst{21-30} = xo;
+  let Inst{10...15} = UIM;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1292,11 +1292,11 @@ class XForm_SP2_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asms
 
   bit RC = 0; // set by isRecordForm
 
-  let Inst{6 - 10} = FRT;
-  let Inst{11 - 12} = SP;
-  let Inst{13 - 15} = 0;
-  let Inst{16 - 20} = FRB;
-  let Inst{21 - 30} = xo;
+  let Inst{6...10} = FRT;
+  let Inst{11...12} = SP;
+  let Inst{13...15} = 0;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31} = RC;
 }
 
@@ -1311,11 +1311,11 @@ class XForm_S1_FRTB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
 
   bit RC = 0; // set by isRecordForm
 
-  let Inst{6 - 10} = FRT;
+  let Inst{6...10} = FRT;
   let Inst{11} = S;
-  let Inst{12 - 15} = 0;
-  let Inst{16 - 20} = FRB;
-  let Inst{21 - 30} = xo;
+  let Inst{12...15} = 0;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31} = RC;
 }
 
@@ -1328,10 +1328,10 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-28} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...28} = xo;
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
@@ -1353,11 +1353,11 @@ class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = CR;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-28} = xo;
+  let Inst{6...8}   = CR;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...28} = xo;
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
   let Inst{31}    = 0;
@@ -1373,12 +1373,12 @@ class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
   let Inst{21}    = 0;
-  let Inst{22-23} = D;
-  let Inst{24-28} = xo;
+  let Inst{22...23} = D;
+  let Inst{24...28} = xo;
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
@@ -1395,11 +1395,11 @@ class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
   let Inst{21}    = RC;
-  let Inst{22-28} = xo;
+  let Inst{22...28} = xo;
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
   let Inst{31}    = XT{5};
@@ -1415,11 +1415,11 @@ class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = XT{4-0};
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-25} = XC{4-0};
-  let Inst{26-27} = xo;
+  let Inst{6...10}  = XT{4...0};
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...25} = XC{4...0};
+  let Inst{26...27} = xo;
   let Inst{28}    = XC{5};
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
@@ -1435,10 +1435,10 @@ class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = immfield;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = immfield;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1451,10 +1451,10 @@ class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = TH;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = TH;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1469,11 +1469,11 @@ class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   let Inst{6}     = T;
-  let Inst{7-8}   = 0;
-  let Inst{9-10}  = STRM;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{7...8}   = 0;
+  let Inst{9...10}  = STRM;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1487,10 +1487,10 @@ class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = CRD;
-  let Inst{11-15} = CRA;
-  let Inst{16-20} = CRB;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = CRD;
+  let Inst{11...15} = CRA;
+  let Inst{16...20} = CRB;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1527,10 +1527,10 @@ class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = CRD;
-  let Inst{11-15} = CRD;
-  let Inst{16-20} = CRD;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = CRD;
+  let Inst{11...15} = CRD;
+  let Inst{16...20} = CRD;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1543,11 +1543,11 @@ class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asm
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = BO;
-  let Inst{11-15} = BI;
-  let Inst{16-18} = 0;
-  let Inst{19-20} = BH;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = BO;
+  let Inst{11...15} = BI;
+  let Inst{16...18} = 0;
+  let Inst{19...20} = BH;
+  let Inst{21...30} = xo;
   let Inst{31}    = lk;
 }
 
@@ -1557,9 +1557,9 @@ class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
   bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
   bits<3>  CR;
   
-  let BO = BIBO{4-0};
-  let BI{0-1} = BIBO{5-6};
-  let BI{2-4} = CR{0-2};
+  let BO = BIBO{4...0};
+  let BI{0...1} = BIBO{5...6};
+  let BI{2...4} = CR{0...2};
   let BH = 0;
 }
 
@@ -1584,12 +1584,12 @@ class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<3> BF;
   bits<3> BFA;
   
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-13} = BFA;
-  let Inst{14-15} = 0;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...13} = BFA;
+  let Inst{14...15} = 0;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1602,13 +1602,13 @@ class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   
   bit RC = 0;
   
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-14} = 0;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...14} = 0;
   let Inst{15}    = W;
-  let Inst{16-19} = U;
+  let Inst{16...19} = U;
   let Inst{20}    = 0;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -1619,9 +1619,9 @@ class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-19}  = 0;
+  let Inst{6...19}  = 0;
   let Inst{20}    = S;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1640,17 +1640,17 @@ class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = BO;
-  let Inst{11-15} = BI;
-  let Inst{16-18} = 0;
-  let Inst{19-20} = BH;
-  let Inst{21-30} = xo1;
+  let Inst{6...10}  = BO;
+  let Inst{11...15} = BI;
+  let Inst{16...18} = 0;
+  let Inst{19...20} = BH;
+  let Inst{21...30} = xo1;
   let Inst{31}    = lk;
 
-  let Inst{38-42} = RST;
-  let Inst{43-47} = RA;
-  let Inst{48-61} = D;
-  let Inst{62-63} = xo2;
+  let Inst{38...42} = RST;
+  let Inst{43...47} = RA;
+  let Inst{48...61} = D;
+  let Inst{62...63} = xo2;
 }
 
 class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
@@ -1677,16 +1677,16 @@ class XLForm_2_ext_and_DForm_1<bits<6> opcode1, bits<10> xo1, bits<5> bo,
 
   let Pattern = pattern;
 
-  let Inst{6-10} = bo;
-  let Inst{11-15} = bi;
-  let Inst{16-18} = 0;
-  let Inst{19-20} = 0;  // Unused (BH)
-  let Inst{21-30} = xo1;
+  let Inst{6...10} = bo;
+  let Inst{11...15} = bi;
+  let Inst{16...18} = 0;
+  let Inst{19...20} = 0;  // Unused (BH)
+  let Inst{21...30} = xo1;
   let Inst{31} = lk;
 
-  let Inst{38-42} = RST;
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D;
+  let Inst{38...42} = RST;
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D;
 }
 
 // 1.7.8 XFX-Form
@@ -1696,7 +1696,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5>  RST;
   bits<10> SPR;
 
-  let Inst{6-10}  = RST;
+  let Inst{6...10}  = RST;
   let Inst{11}    = SPR{4};
   let Inst{12}    = SPR{3};
   let Inst{13}    = SPR{2};
@@ -1707,7 +1707,7 @@ class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{18}    = SPR{7};
   let Inst{19}    = SPR{6};
   let Inst{20}    = SPR{5};
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1722,9 +1722,9 @@ class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
          : I<opcode, OOL, IOL, asmstr, itin> {
   bits<5>  RT;
    
-  let Inst{6-10}  = RT;
-  let Inst{11-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1735,9 +1735,9 @@ class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<10> imm;
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-20} = imm;
-  let Inst{21-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...20} = imm;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1747,11 +1747,11 @@ class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<8>  FXM;
   bits<5>  RST;
 
-  let Inst{6-10}  = RST;
+  let Inst{6...10}  = RST;
   let Inst{11}    = 0;
-  let Inst{12-19} = FXM;
+  let Inst{12...19} = FXM;
   let Inst{20}    = 0;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1761,11 +1761,11 @@ class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<5>  RST;
   bits<8>  FXM;
 
-  let Inst{6-10}  = RST;
+  let Inst{6...10}  = RST;
   let Inst{11}    = 1;
-  let Inst{12-19} = FXM;
+  let Inst{12...19} = FXM;
   let Inst{20}    = 0;
-  let Inst{21-30} = xo;
+  let Inst{21...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1782,10 +1782,10 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   let Inst{6} = 0;
-  let Inst{7-14}  = FM;
+  let Inst{7...14}  = FM;
   let Inst{15} = 0;
-  let Inst{16-20} = RT;
-  let Inst{21-30} = xo;
+  let Inst{16...20} = RT;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -1801,10 +1801,10 @@ class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   let Inst{6}     = L;
-  let Inst{7-14}  = FLM;
+  let Inst{7...14}  = FLM;
   let Inst{15}    = W;
-  let Inst{16-20} = FRB;
-  let Inst{21-30} = xo;
+  let Inst{16...20} = FRB;
+  let Inst{21...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -1819,10 +1819,10 @@ class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
   bit RC = 0;    // set by isRecordForm
   let Pattern = pattern;
 
-  let Inst{6-10}  = RS;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = SH{4,3,2,1,0};
-  let Inst{21-29} = xo;
+  let Inst{6...10}  = RS;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = SH{4,3,2,1,0};
+  let Inst{21...29} = xo;
   let Inst{30}    = SH{5};
   let Inst{31}    = RC;
 }
@@ -1839,11 +1839,11 @@ class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asms
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
   let Inst{21}    = oe;
-  let Inst{22-30} = xo;
+  let Inst{22...30} = xo;
   let Inst{31}    = RC;  
 }
 
@@ -1866,11 +1866,11 @@ class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = FRT;
-  let Inst{11-15} = FRA;
-  let Inst{16-20} = FRB;
-  let Inst{21-25} = FRC;
-  let Inst{26-30} = xo;
+  let Inst{6...10}  = FRT;
+  let Inst{11...15} = FRA;
+  let Inst{16...20} = FRB;
+  let Inst{21...25} = FRC;
+  let Inst{26...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -1896,11 +1896,11 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-25} = COND;
-  let Inst{26-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...25} = COND;
+  let Inst{26...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -1918,11 +1918,11 @@ class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RS;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-25} = MB;
-  let Inst{26-30} = ME;
+  let Inst{6...10}  = RS;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...25} = MB;
+  let Inst{26...30} = ME;
   let Inst{31}    = RC;
 }
 
@@ -1939,11 +1939,11 @@ class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RS;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = SH;
-  let Inst{21-25} = MB;
-  let Inst{26-30} = ME;
+  let Inst{6...10}  = RS;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = SH;
+  let Inst{21...25} = MB;
+  let Inst{26...30} = ME;
   let Inst{31}    = RC;
 }
 
@@ -1960,11 +1960,11 @@ class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RS;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = SH{4,3,2,1,0};
-  let Inst{21-26} = MBE{4,3,2,1,0,5};
-  let Inst{27-29} = xo;
+  let Inst{6...10}  = RS;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = SH{4,3,2,1,0};
+  let Inst{21...26} = MBE{4,3,2,1,0,5};
+  let Inst{27...29} = xo;
   let Inst{30}    = SH{5};
   let Inst{31}    = RC;
 }
@@ -1981,11 +1981,11 @@ class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = RS;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-26} = MBE{4,3,2,1,0,5};
-  let Inst{27-30} = xo;
+  let Inst{6...10}  = RS;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...26} = MBE{4,3,2,1,0,5};
+  let Inst{27...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -2003,11 +2003,11 @@ class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
   
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-25} = RC;
-  let Inst{26-31} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...25} = RC;
+  let Inst{26...31} = xo;
 }
 
 // VAForm_1a - DABC ordering.
@@ -2021,11 +2021,11 @@ class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
   
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-25} = RC;
-  let Inst{26-31} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...25} = RC;
+  let Inst{26...31} = xo;
 }
 
 class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
@@ -2038,12 +2038,12 @@ class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
   
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
   let Inst{21}    = 0;
-  let Inst{22-25} = SH;
-  let Inst{26-31} = xo;
+  let Inst{22...25} = SH;
+  let Inst{26...31} = xo;
 }
 
 // E-2 VX-Form
@@ -2056,10 +2056,10 @@ class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = VA;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -2078,10 +2078,10 @@ class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = 0;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -2092,10 +2092,10 @@ class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = IMM;
-  let Inst{16-20} = 0;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = IMM;
+  let Inst{16...20} = 0;
+  let Inst{21...31} = xo;
 }
 
 /// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
@@ -2106,10 +2106,10 @@ class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = 0;
-  let Inst{16-20} = 0;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = 0;
+  let Inst{21...31} = xo;
 }
 
 /// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
@@ -2120,10 +2120,10 @@ class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = 0;
-  let Inst{11-15} = 0;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = 0;
+  let Inst{11...15} = 0;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 // e.g. [PO VRT EO VRB XO]
@@ -2135,10 +2135,10 @@ class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = eo;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = eo;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 /// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX"
@@ -2152,11 +2152,11 @@ class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = VA;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = VA;
   let Inst{16} =  ST;
-  let Inst{17-20} = SIX;
-  let Inst{21-31} = xo;
+  let Inst{17...20} = SIX;
+  let Inst{21...31} = xo;
 }
 
 /// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox"
@@ -2168,10 +2168,10 @@ class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = 0;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = VA;
+  let Inst{16...20} = 0;
+  let Inst{21...31} = xo;
 }
 
 // E-4 VXR-Form
@@ -2185,11 +2185,11 @@ class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
   
   let Pattern = pattern;
   
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = VB;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = VA;
+  let Inst{16...20} = VB;
   let Inst{21}    = RC;
-  let Inst{22-31} = xo;
+  let Inst{22...31} = xo;
 }
 
 // VX-Form: [PO VRT EO VRB 1 PS XO]
@@ -2203,12 +2203,12 @@ class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = eo;
-  let Inst{16-20} = VB;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = eo;
+  let Inst{16...20} = VB;
   let Inst{21}    = 1;
   let Inst{22}    = PS;
-  let Inst{23-31} = xo;
+  let Inst{23...31} = xo;
 }
 
 // VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO]
@@ -2222,12 +2222,12 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = VD;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = VB;
+  let Inst{6...10}  = VD;
+  let Inst{11...15} = VA;
+  let Inst{16...20} = VB;
   let Inst{21}    = 1;
   let Inst{22}    = PS;
-  let Inst{23-31} = xo;
+  let Inst{23...31} = xo;
 }
 
 class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
@@ -2240,11 +2240,11 @@ class Z22Form_BF3_FRA5_DCM6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = FRA;
-  let Inst{16-21} = DCM;
-  let Inst{22-30} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = FRA;
+  let Inst{16...21} = DCM;
+  let Inst{22...30} = xo;
   let Inst{31}    = 0;
 }
 
@@ -2260,10 +2260,10 @@ class Z22Form_FRTA5_SH6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   bit RC = 0; // set by isRecordForm
 
-  let Inst{6 - 10} = FRT;
-  let Inst{11 - 15} = FRA;
-  let Inst{16 - 21} = SH;
-  let Inst{22 - 30} = xo;
+  let Inst{6...10} = FRT;
+  let Inst{11...15} = FRA;
+  let Inst{16...21} = SH;
+  let Inst{22...30} = xo;
   let Inst{31} = RC;
 }
 
@@ -2279,12 +2279,12 @@ class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   bit RC = 0;    // set by isRecordForm
 
-  let Inst{6-10}  = VRT;
-  let Inst{11-14} = 0;
+  let Inst{6...10}  = VRT;
+  let Inst{11...14} = 0;
   let Inst{15} = R;
-  let Inst{16-20} = VRB;
-  let Inst{21-22} = idx;
-  let Inst{23-30} = xo;
+  let Inst{16...20} = VRB;
+  let Inst{21...22} = idx;
+  let Inst{23...30} = xo;
   let Inst{31}    = RC;
 }
 
@@ -2298,11 +2298,11 @@ class Z23Form_RTAB5_CY2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-22} = CY;
-  let Inst{23-30} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...22} = CY;
+  let Inst{23...30} = xo;
   let Inst{31} = 0;
 }
 
@@ -2318,11 +2318,11 @@ class Z23Form_FRTAB5_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
   bit RC = 0; // set by isRecordForm
 
-  let Inst{6 - 10} = FRT;
-  let Inst{11 - 15} = FRA;
-  let Inst{16 - 20} = FRB;
-  let Inst{21 - 22} = RMC;
-  let Inst{23 - 30} = xo;
+  let Inst{6...10} = FRT;
+  let Inst{11...15} = FRA;
+  let Inst{16...20} = FRB;
+  let Inst{21...22} = RMC;
+  let Inst{23...30} = xo;
   let Inst{31} = RC;
 }
 
@@ -2345,12 +2345,12 @@ class Z23Form_FRTB5_R1_RMC2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
   bit RC = 0; // set by isRecordForm
 
-  let Inst{6 - 10} = FRT;
-  let Inst{11 - 14} = 0;
+  let Inst{6...10} = FRT;
+  let Inst{11...14} = 0;
   let Inst{15} = R;
-  let Inst{16 - 20} = FRB;
-  let Inst{21 - 22} = RMC;
-  let Inst{23 - 30} = xo;
+  let Inst{16...20} = FRB;
+  let Inst{21...22} = RMC;
+  let Inst{23...30} = xo;
   let Inst{31} = RC;
 }
 
@@ -2362,7 +2362,7 @@ class PPCEmitTimePseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
   let isCodeGenOnly = 1;
   let PPC64 = 0;
   let Pattern = pattern;
-  let Inst{31-0} = 0;
+  let Inst{31...0} = 0;
   let hasNoSchedulingInfo = 1;
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 80fac18d5737..a12dfae2a0d7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -13,7 +13,7 @@
 
 class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
                       string asmstr, list<dag> pattern>
-         : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<5> RT;
   bits<5> RA;
   bits<5> RB;
@@ -21,64 +21,174 @@ class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isRecordForm
+  bit RC = 0; // set by isRecordForm
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21}    = L;
-  let Inst{22-30} = xo;
-  let Inst{31}    = RC;
+  let Inst{6...10} = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21} = L;
+  let Inst{22...30} = xo;
+  let Inst{31} = RC;
 }
 
 multiclass XOForm_RTAB5_L1r<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
-                            string asmbase, string asmstr,
-                            list<dag> pattern> {
+                            string asmbase, string asmstr, list<dag> pattern> {
   let BaseName = asmbase in {
     def NAME : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
                                !strconcat(asmbase, !strconcat(" ", asmstr)),
-                               pattern>, RecFormRel;
-    let Defs = [CR0] in
-    def _rec : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
-                               !strconcat(asmbase, !strconcat(". ", asmstr)),
-                               []>, isRecordForm, RecFormRel;
+                               pattern>,
+               RecFormRel;
+    let Defs = [CR0] in def _rec
+        : XOForm_RTAB5_L1<opcode, xo, OOL, IOL,
+                          !strconcat(asmbase, !strconcat(". ", asmstr)), []>,
+        isRecordForm, RecFormRel;
   }
 }
 
+class VXForm_VRTB5<bits<11> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
+                   list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> VRT;
+  bits<5> VRB;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = VRT;
+  let Inst{11...15} = R;
+  let Inst{16...20} = VRB;
+  let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM2<bits<11> xo, bits<3> R, dag OOL, dag IOL, string asmstr,
+                        list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> VRT;
+  bits<5> VRB;
+  bits<2> UIM;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = VRT;
+  let Inst{11...13} = R;
+  let Inst{14...15} = UIM;
+  let Inst{16...20} = VRB;
+  let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM1<bits<11> xo, bits<4> R, dag OOL, dag IOL, string asmstr,
+                        list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> VRT;
+  bits<5> VRB;
+  bits<1> UIM;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = VRT;
+  let Inst{11...14} = R;
+  let Inst{15} = UIM;
+  let Inst{16...20} = VRB;
+  let Inst{21...31} = xo;
+}
+
+class VXForm_VRTB5_UIM3<bits<11> xo, bits<2> R, dag OOL, dag IOL, string asmstr,
+                        list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> VRT;
+  bits<5> VRB;
+  bits<3> UIM;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = VRT;
+  let Inst{11...12} = R;
+  let Inst{13...15} = UIM;
+  let Inst{16...20} = VRB;
+  let Inst{21...31} = xo;
+}
+
+class VXForm_VRTAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+                    list<dag> pattern> : I<4, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> VRT;
+  bits<5> VRA;
+  bits<5> VRB;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = VRT;
+  let Inst{11...15} = VRA;
+  let Inst{16...20} = VRB;
+  let Inst{21...31} = xo;
+}
+
 let Predicates = [IsISAFuture] in {
   defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT),
-                                 (ins g8rc:$RA, g8rc:$RB, u1imm:$L),
-                                 "subfus",  "$RT, $L, $RA, $RB", []>;
+                                 (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus",
+                                 "$RT, $L, $RA, $RB", []>;
 }
 
 let Predicates = [HasVSX, IsISAFuture] in {
   let mayLoad = 1 in {
-    def LXVRL
-        : XX1Form_memOp<31, 525, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
-                        "lxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-    def LXVRLL
-        : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins memr:$RA, g8rc:$RB),
-                        "lxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
-    def LXVPRL
-        : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
-                          "lxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-    def LXVPRLL
-        : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), (ins memr:$RA, g8rc:$RB),
-                          "lxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+    def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT),
+                              (ins (memr $RA):$addr, g8rc:$RB),
+                              "lxvrl $XT, $addr, $RB", IIC_LdStLoad, []>;
+    def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT),
+                               (ins (memr $RA):$addr, g8rc:$RB),
+                               "lxvrll $XT, $addr, $RB", IIC_LdStLoad, []>;
+    def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp),
+                                 (ins (memr $RA):$addr, g8rc:$RB),
+                                 "lxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>;
+    def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp),
+                                  (ins (memr $RA):$addr, g8rc:$RB),
+                                  "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
   }
 
   let mayStore = 1 in {
-    def STXVRL
-        : XX1Form_memOp<31, 653, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
-                        "stxvrl $XT, $RA, $RB", IIC_LdStLoad, []>;
-    def STXVRLL
-        : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, memr:$RA, g8rc:$RB),
-                        "stxvrll $XT, $RA, $RB", IIC_LdStLoad, []>;
+    def STXVRL : XX1Form_memOp<31, 653, (outs),
+                               (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB),
+                               "stxvrl $XT, $addr, $RB", IIC_LdStLoad, []>;
+    def STXVRLL : XX1Form_memOp<31, 685, (outs),
+                                (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB),
+                                "stxvrll $XT, $addr, $RB", IIC_LdStLoad, []>;
     def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs),
-                                  (ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
-                                  "stxvprl $XTp, $RA, $RB", IIC_LdStLFD, []>;
-    def STXVPRLL : XForm_XTp5_XAB5<31, 749, (outs),
-                                   (ins vsrprc:$XTp, memr:$RA, g8rc:$RB),
-                                   "stxvprll $XTp, $RA, $RB", IIC_LdStLFD, []>;
+                                  (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB),
+                                  "stxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>;
+    def STXVPRLL
+        : XForm_XTp5_XAB5<31, 749, (outs),
+                          (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB),
+                          "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
   }
+
+  def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB),
+                                "vupkhsntob $VRT, $VRB", []>;
+  def VUPKLSNTOB : VXForm_VRTB5<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB),
+                                "vupklsntob $VRT, $VRB", []>;
+  def VUPKINT4TOBF16
+      : VXForm_VRTB5_UIM2<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM),
+                          "vupkint4tobf16 $VRT, $VRB, $UIM", []>;
+  def VUPKINT8TOBF16
+      : VXForm_VRTB5_UIM1<387, 1, (outs vrrc:$VRT), (ins vrrc:$VRB, u1imm:$UIM),
+                          "vupkint8tobf16 $VRT, $VRB, $UIM", []>;
+  def VUPKINT8TOFP32
+      : VXForm_VRTB5_UIM2<387, 3, (outs vrrc:$VRT), (ins vrrc:$VRB, u2imm:$UIM),
+                          "vupkint8tofp32 $VRT, $VRB, $UIM", []>;
+  def VUPKINT4TOFP32
+      : VXForm_VRTB5_UIM3<387, 2, (outs vrrc:$VRT), (ins vrrc:$VRB, u3imm:$UIM),
+                          "vupkint4tofp32 $VRT, $VRB, $UIM", []>;
+
+  def VUCMPRHN : VXForm_VRTAB5<3, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                               "vucmprhn $VRT, $VRA, $VRB", []>;
+  def VUCMPRLN : VXForm_VRTAB5<67, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                               "vucmprln $VRT, $VRA, $VRB", []>;
+  def VUCMPRHB
+      : VXForm_VRTAB5<131, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                      "vucmprhb $VRT, $VRA, $VRB", []>;
+  def VUCMPRLB
+      : VXForm_VRTAB5<195, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                      "vucmprlb $VRT, $VRA, $VRB", []>;
+  def VUCMPRHH
+      : VXForm_VRTAB5<259, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                      "vucmprhh $VRT, $VRA, $VRB", []>;
+  def VUCMPRLH
+      : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
+                      "vucmprlh $VRT, $VRA, $VRB", []>;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index ef8b27f9b8d3..884895793752 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -8,12 +8,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This file describes the instructions introduced for the Future CPU for MMA.
+// Please reference "PPCInstrVSX.td" for file structure.
 //
 //===----------------------------------------------------------------------===//
 
 class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
                            string asmstr, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<3> AT;
   bits<5> XAp;
   bits<5> XBp;
@@ -21,13 +22,13 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT{2-0};
-  let Inst{9-10} = 0;
-  let Inst{11-14} = XAp{3-0};
+  let Inst{6...8} = AT{2...0};
+  let Inst{9...10} = 0;
+  let Inst{11...14} = XAp{3...0};
   let Inst{15} = P;
-  let Inst{16-19} = XBp{3-0};
+  let Inst{16...19} = XBp{3...0};
   let Inst{20} = 0;
-  let Inst{21-28} = xo;
+  let Inst{21...28} = xo;
   let Inst{29} = XAp{4};
   let Inst{30} = XBp{4};
   let Inst{31} = 0;
@@ -35,65 +36,64 @@ class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
 class XX2Form_AT3_XBp5_P2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
                           string asmstr, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<3> AT;
   bits<5> XBp;
   bits<2> P;
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT{2-0};
-  let Inst{9-14} = 0;
+  let Inst{6...8} = AT{2...0};
+  let Inst{9...14} = 0;
   let Inst{15} = P{0};
-  let Inst{16-19} = XBp{3-0};
+  let Inst{16...19} = XBp{3...0};
   let Inst{20} = P{1};
-  let Inst{21-29} = xo;
+  let Inst{21...29} = xo;
   let Inst{30} = XBp{4};
   let Inst{31} = 0;
 }
 
 class XForm_ATB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
                  string asmstr, list<dag> pattern>
-  : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<3> AT;
   bits<3> AB;
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT{2-0};
-  let Inst{9-10} = 0;
-  let Inst{11-15} = o;
-  let Inst{16-18} = AB{2-0};
-  let Inst{19-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8} = AT{2...0};
+  let Inst{9...10} = 0;
+  let Inst{11...15} = o;
+  let Inst{16...18} = AB{2...0};
+  let Inst{19...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
 class XX3Form_AT3_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
-                           string asmstr, InstrItinClass itin,
-                           list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, itin> {
+                         string asmstr, InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
   bits<3> AT;
   bits<5> XAp;
   bits<6> XB;
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT;
-  let Inst{9-10} = 0;
-  let Inst{11-14} = XAp{3-0};
+  let Inst{6...8} = AT;
+  let Inst{9...10} = 0;
+  let Inst{11...14} = XAp{3...0};
   let Inst{15} = 0;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-28} = xo;
-  let Inst{29}    = XAp{4};
-  let Inst{30}    = XB{5};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...28} = xo;
+  let Inst{29} = XAp{4};
+  let Inst{30} = XB{5};
   let Inst{31} = 0;
 }
 
 class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
                                  string asmstr, InstrItinClass itin,
                                  list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+    : PI<1, opcode, OOL, IOL, asmstr, itin> {
   bits<3> AT;
   bits<5> XAp;
   bits<6> XB;
@@ -104,29 +104,29 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-19} = PMSK;
-  let Inst{20-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...15} = 0;
+  let Inst{16...19} = PMSK;
+  let Inst{20...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-46} = XAp{3-0};
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...46} = XAp{3...0};
   let Inst{47} = 0;
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XAp{4};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
 }
 
 class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
-                                 string asmstr, InstrItinClass itin,
-                                 list<dag> pattern>
-  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+                                  string asmstr, InstrItinClass itin,
+                                  list<dag> pattern>
+    : PI<1, opcode, OOL, IOL, asmstr, itin> {
   bits<3> AT;
   bits<5> XAp;
   bits<6> XB;
@@ -137,21 +137,21 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-17} = PMSK;
-  let Inst{18-19} = 0;
-  let Inst{20-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...15} = 0;
+  let Inst{16...17} = PMSK;
+  let Inst{18...19} = 0;
+  let Inst{20...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-46} = XAp{3-0};
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...46} = XAp{3...0};
   let Inst{47} = 0;
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XAp{4};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -160,14 +160,15 @@ class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
                        string asmstr> {
   let Predicates = [MMA, IsISAFuture] in {
-  def NAME :
-    XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL,
-                     !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"@earlyclobber $AT">;
-  def PP :
-    XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-                     !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NAME
+        : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL,
+                             !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"@earlyclobber $AT">;
+    def PP
+        : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -175,202 +176,217 @@ multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
                             string asmstr> {
   defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
   let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-  def PM#NAME :
-    MMIRR_XX3Form_X8YP4_XAp5B6<
-      opcode, !or(xo, 0x01), (outs dmr:$AT),
-      !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
-      !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"@earlyclobber $AT">;
-  def PM#NAME#PP :
-    MMIRR_XX3Form_X8YP4_XAp5B6<
-      opcode, xo, (outs dmr:$AT),
-      !con((ins dmr:$ATi),
-           !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
-      !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME
+        : MMIRR_XX3Form_X8YP4_XAp5B6<
+              opcode, !or(xo, 0x01), (outs dmr:$AT),
+              !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
+              !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP
+        : MMIRR_XX3Form_X8YP4_XAp5B6<
+              opcode, xo, (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
+              !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
 multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
-                       string asmstr> {
+                            string asmstr> {
   let Predicates = [MMA, IsISAFuture] in {
-  def NAME :
-    XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
-                     !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"@earlyclobber $AT">;
-  def PP :
-    XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-                     !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def NAME
+        : XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
+                             !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"@earlyclobber $AT">;
+    def PP
+        : XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
-multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
-                            string asmstr> {
+multiclass DMR_BF16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
+                                 string asmbase, string asmstr> {
   defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
   let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-  def PM#NAME :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !or(xo, 0x11), (outs dmr:$AT),
-      !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
-      !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"@earlyclobber $AT">;
-  def PM#NAME#PP :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, xo, (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !or(xo, 0x11), (outs dmr:$AT),
+              !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+              !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, xo, (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
-multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
-                            string asmstr> {
+multiclass DMR_F16_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL,
+                                string asmbase, string asmstr> {
   defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
   let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-  def PM#NAME :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !or(xo, 0x01), (outs dmr:$AT),
-      !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
-      !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"@earlyclobber $AT">;
-  def PM#NAME#PP :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, xo, (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !or(xo, 0x01), (outs dmr:$AT),
+              !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+              !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"@earlyclobber $AT">;
+    def PM#NAME#PP
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, xo, (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
 multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
-                                  string asmbase, string asmstr> {
+                                       string asmbase, string asmstr> {
   defm NAME : DMR_BF16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
   let Predicates = [MMA, IsISAFuture] in {
-  def PN : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def NP : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0x39), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def NN : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0xA0), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PN
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def NP
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x39), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def NN
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-   def PM#NAME#PN :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0xF9), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def PM#NAME#NP :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0x39), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def PM#NAME#NN :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0xA0), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#PN
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def PM#NAME#NP
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0x39), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def PM#NAME#NN
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
 multiclass DMR_NEG_UM_M284_XOXORd11188<bits<6> opcode, bits<8> xo, dag IOL,
-                                  string asmbase, string asmstr> {
+                                       string asmbase, string asmstr> {
   defm NAME : DMR_F16_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
   let Predicates = [MMA, IsISAFuture] in {
-  def PN : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0xD1), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def NP : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0x11), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def NN : XX3Form_AT3_XAp5B6<
-             opcode, !xor(xo, 0x88), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
-             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PN
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0xD1), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def NP
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x11), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def NN
+        : XX3Form_AT3_XAp5B6<opcode, !xor(xo, 0x88), (outs dmr:$AT),
+                             !con((ins dmr:$ATi), IOL),
+                             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-   def PM#NAME#PN :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0xD1), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def PM#NAME#NP :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0x11), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-  def PM#NAME#NN :
-    MMIRR_XX3Form_X8Y4P2_XAp5B6<
-      opcode, !xor(xo, 0x88), (outs dmr:$AT),
-      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
-      !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
-      IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    def PM#NAME#PN
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0xD1), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def PM#NAME#NP
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0x11), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
+    def PM#NAME#NN
+        : MMIRR_XX3Form_X8Y4P2_XAp5B6<
+              opcode, !xor(xo, 0x88), (outs dmr:$AT),
+              !con((ins dmr:$ATi),
+                   !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+              !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+              IIC_VecFP, []>,
+          RegConstraint<"$ATi = $AT">;
   }
 }
 
 class XForm_AT3_T1_AB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
                        string asmstr, list<dag> pattern>
-  : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<3> AT;
   bits<3> AB;
   bits<1> T;
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT{2-0};
+  let Inst{6...8} = AT{2...0};
   let Inst{9} = 0;
   let Inst{10} = T;
-  let Inst{11-15} = o;
-  let Inst{16-18} = AB{2-0};
-  let Inst{19-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{11...15} = o;
+  let Inst{16...18} = AB{2...0};
+  let Inst{19...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
 class XForm_ATp2_SR5<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
                      string asmstr, list<dag> pattern>
-  : I <opcode, OOL, IOL, asmstr, NoItinerary> {
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<2> ATp;
   bits<5> SR;
 
   let Pattern = pattern;
 
-  let Inst{6-7} = ATp{1-0};
-  let Inst{8-10} = 0;
-  let Inst{11-15} = o;
-  let Inst{16-20} = SR{4-0};
-  let Inst{21-30} = xo;
+  let Inst{6...7} = ATp{1...0};
+  let Inst{8...10} = 0;
+  let Inst{11...15} = o;
+  let Inst{16...20} = SR{4...0};
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
 class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
-                          string asmstr, list<dag> pattern>
-  : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+                                 string asmstr, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
   bits<3> AT;
   bits<6> XB;
   bits<2> ID;
@@ -379,41 +395,48 @@ class XX2Form_AT3_XB6_ID2_E1_BL2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT{2-0};
-  let Inst{9-10} = 0;
-  let Inst{11-12} = ID{1-0};
+  let Inst{6...8} = AT{2...0};
+  let Inst{9...10} = 0;
+  let Inst{11...12} = ID{1...0};
   let Inst{13} = E;
-  let Inst{14-15} = BL{1-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{14...15} = BL{1...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30} = XB{5};
   let Inst{31} = 0;
 }
 
-let Predicates = [IsISAFuture] in {
-  def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
-                                            (outs vsrprc:$XAp, vsrprc:$XBp),
-                                            (ins wacc:$AT),
-                                            "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> {
+//-------------------------- Instruction definitions -------------------------//
+// Predicate combinations available:
+// [MMA, IsISAFuture]
+// [MMA, PrefixInstrs, IsISAFuture]
+
+let Predicates = [MMA, IsISAFuture] in {
+  def DMXXEXTFDMR512
+      : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp),
+                             (ins wacc:$AT),
+                             "dmxxextfdmr512 $XAp, $XBp, $AT, 0", []> {
     let P = 0;
   }
 
-  def DMXXEXTFDMR512_HI : XX3Form_AT3_XABp5_P1<60, 226,
-                                               (outs vsrprc:$XAp, vsrprc:$XBp),
-                                               (ins wacc_hi:$AT),
-                                               "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> {
+  def DMXXEXTFDMR512_HI
+      : XX3Form_AT3_XABp5_P1<60, 226, (outs vsrprc:$XAp, vsrprc:$XBp),
+                             (ins wacc_hi:$AT),
+                             "dmxxextfdmr512 $XAp, $XBp, $AT, 1", []> {
     let P = 1;
   }
 
-  def DMXXINSTDMR512 : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT),
-                                             (ins vsrprc:$XAp, vsrprc:$XBp),
-                                             "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> {
+  def DMXXINSTDMR512
+      : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT),
+                             (ins vsrprc:$XAp, vsrprc:$XBp),
+                             "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> {
     let P = 0;
   }
 
-  def DMXXINSTDMR512_HI : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT),
-                                                (ins vsrprc:$XAp, vsrprc:$XBp),
-                                                "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> {
+  def DMXXINSTDMR512_HI
+      : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT),
+                             (ins vsrprc:$XAp, vsrprc:$XBp),
+                             "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> {
     let P = 1;
   }
 
@@ -422,236 +445,220 @@ let Predicates = [IsISAFuture] in {
                                            "dmxxextfdmr256 $XBp, $AT, $P", []>;
 
   def DMXXINSTDMR256 : XX2Form_AT3_XBp5_P2<60, 485, (outs dmrrowp:$AT),
-                                            (ins vsrprc:$XBp, u2imm:$P),
-                                            "dmxxinstdmr256 $AT, $XBp, $P", []>;
+                                           (ins vsrprc:$XBp, u2imm:$P),
+                                           "dmxxinstdmr256 $AT, $XBp, $P", []>;
 
-  def DMMR : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB),
-                        "dmmr $AT, $AB", 
-                        [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>;
+  def DMMR
+      : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB), "dmmr $AT, $AB",
+                   [(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>;
 
   def DMXOR : XForm_ATB3<31, 7, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB),
                          "dmxor $AT, $AB",
-                         [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi, v1024i1:$AB))]>,
-                         RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-
-  def DMSETDMRZ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins),
-                            "dmsetdmrz $AT", NoItinerary,
-                            [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>;
-}
-
-// MMA+ accumulating/non-accumulating instructions.
-
-// DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4,  PMDMXVI8GERX4PP
-defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB),
-                               "dmxvi8gerx4", "$AT, $XAp, $XB">;
-
-let Predicates = [MMA, IsISAFuture] in {
-  def DMXVI8GERX4SPP :
-    XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT), (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB),
-                     "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-}
+                         [(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi,
+                                                v1024i1:$AB))]>,
+              RegConstraint<"$ATi = $AT">;
+
+  def DMSETDMRZ
+      : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins), "dmsetdmrz $AT",
+                  NoItinerary, [(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>;
+
+  // DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP
+  defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB),
+                                      "dmxvi8gerx4", "$AT, $XAp, $XB">;
+
+  // DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, DMXVBF16GERX2NP,
+  // DMXVBF16GERX2NN, PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN,
+  // PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
+  defm DMXVBF16GERX2
+      : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
+                                    "dmxvbf16gerx2", "$AT, $XAp, $XB">;
+
+  // DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, DMXVF16GERX2NP,
+  // DMXVF16GERX2NN, PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN,
+  // PMDMXVF16GERX2NP, PMDMXVF16GERX2NN
+  defm DMXVF16GERX2
+      : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB),
+                                    "dmxvf16gerx2", "$AT, $XAp, $XB">;
+
+  // DMF cryptography [support] Instructions
+  def DMSHA2HASH
+      : XForm_AT3_T1_AB3<
+            31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T),
+            "dmsha2hash $AT, $AB, $T",
+            [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi,
+                                   v1024i1:$AB, timm:$T))]>,
+        RegConstraint<"$ATi = $AT">;
+  def DMSHA3HASH
+      : XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp),
+                       (ins dmrp:$ATpi, u5imm:$SR), "dmsha3hash $ATp, $SR",
+                       [(set v2048i1:$ATp,
+                           (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>,
+        RegConstraint<"$ATpi = $ATp">;
+  def DMXXSHAPAD
+      : XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT),
+                                   (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E,
+                                       u2imm:$BL),
+                                   "dmxxshapad $AT, $XB, $ID, $E, $BL", []>,
+        RegConstraint<"$ATi = $AT">;
+
+  // MMA+ accumulating/non-accumulating instructions.
+  def DMXVI8GERX4SPP
+      : XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT),
+                           (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB),
+                           "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>,
+        RegConstraint<"$ATi = $AT">;
+
+} // End of [MMA, IsISAFuture]
 
 let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-  def PMDMXVI8GERX4SPP :
-    MMIRR_XX3Form_X8YP4_XAp5B6<59, 98, (outs dmr:$AT),
-                            (ins dmr:$ATi, vsrprc:$XAp,vsrc:$XB, u8imm:$XMSK,
-                             u4imm:$YMSK, u4imm:$PMSK),
-                            "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK",
-                            IIC_VecGeneral, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def PMDMXVI8GERX4SPP
+      : MMIRR_XX3Form_X8YP4_XAp5B6<
+            59, 98, (outs dmr:$AT),
+            (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB, u8imm:$XMSK, u4imm:$YMSK,
+                u4imm:$PMSK),
+            "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK",
+            IIC_VecGeneral, []>,
+        RegConstraint<"$ATi = $AT">;
 }
 
-// DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, dMXVBF16GERX2NP, DMXVBF16GERX2NN
-// PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
-defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
-                                         "dmxvbf16gerx2", "$AT, $XAp, $XB">;
-
-// DMXVF16GERX2, DMXVF16GERX2PP, DMXVF16GERX2PN, dMXVF16GERX2NP, DMXVF16GERX2NN
-// PMDMXVF16GERX2, PMDMXVF16GERX2PP, PMDMXVF16GERX2PN, PMDMXVF16GERX2NP, PMDMXVF16GERX2NN
-defm DMXVF16GERX2 : DMR_NEG_UM_M284_XOXORd11188<59, 66, (ins vsrprc:$XAp, vsrc:$XB),
-                                         "dmxvf16gerx2", "$AT, $XAp, $XB">;
-
-// DMF cryptography [support] Instructions
-let Predicates = [IsISAFuture] in {
-  def DMSHA2HASH :
-    XForm_AT3_T1_AB3<31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T),
-                     "dmsha2hash $AT, $AB, $T",
-                     [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi, v1024i1:$AB, timm:$T))]>,
-                     RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-
-  def DMSHA3HASH :
-    XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp), (ins dmrp:$ATpi , u5imm:$SR),
-                   "dmsha3hash $ATp, $SR",
-                   [(set v2048i1:$ATp, (int_ppc_mma_dmsha3hash v2048i1:$ATpi, timm:$SR))]>,
-                   RegConstraint<"$ATpi = $ATp">, NoEncode<"$ATpi">;
-
-  def DMXXSHAPAD :
-    XX2Form_AT3_XB6_ID2_E1_BL2<60, 421, (outs dmr:$AT),
-                               (ins dmr:$ATi, vsrc:$XB, u2imm:$ID, u1imm:$E, u2imm:$BL),
-                               "dmxxshapad $AT, $XB, $ID, $E, $BL", []>,
-                               RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
-}
+//---------------------------- Anonymous Patterns ----------------------------//
+// Predicate combinations available:
+// [MMA, IsISAFuture]
+// [MMA, PrefixInstrs, IsISAFuture]
 
-// MMA+ Intrinsics
 let Predicates = [MMA, IsISAFuture] in {
+  // MMA+ Intrinsics
   def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
             (DMXVI8GERX4 $XAp, RCCp.BToVSRC)>;
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>;
-
   def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2 v256i1:$XAp, v16i8:$XB)),
             (DMXVBF16GERX2 $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
-
   def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2 v256i1:$XAp, v16i8:$XB)),
             (DMXVF16GERX2 $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB)),
             (DMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
+
+  // Cryptography Intrinsic
+  def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID,
+                timm:$E, timm:$BL)),
+            (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>;
 }
 
 let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
-                                            Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
-            (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                        Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
-
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk4Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB,
+                Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
+            (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                Msk4Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
             (PMDMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk4Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
             (PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
-                                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
-            (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                        Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB,
+                Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
-                                             Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
-            (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2 v256i1:$XAp, v16i8:$XB,
+                Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+            (PMDMXVF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pp v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2pn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                              Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2np v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 
-  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
-                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
-                                               Msk2Imm:$PMSK)),
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvf16gerx2nn v1024i1:$ATi, v256i1:$XAp,
+                v16i8:$XB, Msk8Imm:$XMSK, Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
             (PMDMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
-                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
-}
-
-// Cryptography Intrinsic
-let Predicates = [IsISAFuture] in {
-  def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, timm:$ID,
-                      timm:$E, timm:$BL)), (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>;
+                Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 }
 
-// MMA+ Instruction aliases
-let Predicates = [IsISAFuture] in {
-  def : InstAlias<"dmsha256hash $AT, $AB",
-                  (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>;
+//---------------------------- Instruction aliases ---------------------------//
 
-  def : InstAlias<"dmsha512hash $AT, $AB",
-                  (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>;
-
-  def : InstAlias<"dmsha3dw $ATp",
-                  (DMSHA3HASH dmrp:$ATp, 0)>;
-
-  def : InstAlias<"dmcryshash $ATp",
-                  (DMSHA3HASH dmrp:$ATp, 12)>;
-
-  def : InstAlias<"dmxxsha3512pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 0)>;
-
-  def : InstAlias<"dmxxsha3384pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 1)>;
-
-  def : InstAlias<"dmxxsha3256pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 2)>;
-
-  def : InstAlias<"dmxxsha3224pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 0, u1imm:$E, 3)>;
-
-  def : InstAlias<"dmxxshake256pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 0)>;
-
-  def : InstAlias<"dmxxshake128pad $AT, $XB, $E",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 1, u1imm:$E, 1)>;
-
-  def : InstAlias<"dmxxsha384512pad $AT, $XB",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2, 0, 0)>;
-
-  def : InstAlias<"dmxxsha224256pad $AT, $XB",
-                  (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3, 0, 0)>;
+let Predicates = [MMA, IsISAFuture] in {
+  def : InstAlias<"dmsha256hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 0)>;
+  def : InstAlias<"dmsha512hash $AT, $AB", (DMSHA2HASH dmr:$AT, dmr:$AB, 1)>;
+  def : InstAlias<"dmsha3dw $ATp", (DMSHA3HASH dmrp:$ATp, 0)>;
+  def : InstAlias<"dmcryshash $ATp", (DMSHA3HASH dmrp:$ATp, 12)>;
+  def : InstAlias<"dmxxsha3512pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                     0, u1imm:$E, 0)>;
+  def : InstAlias<"dmxxsha3384pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                     0, u1imm:$E, 1)>;
+  def : InstAlias<"dmxxsha3256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                     0, u1imm:$E, 2)>;
+  def : InstAlias<"dmxxsha3224pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                     0, u1imm:$E, 3)>;
+  def : InstAlias<"dmxxshake256pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                      1, u1imm:$E, 0)>;
+  def : InstAlias<"dmxxshake128pad $AT, $XB, $E", (DMXXSHAPAD dmr:$AT, vsrc:$XB,
+                                                      1, u1imm:$E, 1)>;
+  def : InstAlias<"dmxxsha384512pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 2,
+                                                   0, 0)>;
+  def : InstAlias<"dmxxsha224256pad $AT, $XB", (DMXXSHAPAD dmr:$AT, vsrc:$XB, 3,
+                                                   0, 0)>;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 7c1550e99bae..db066bc4b7bd 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/StackMaps.h"
@@ -87,8 +88,8 @@ static cl::opt<bool> EnableFMARegPressureReduction(
 // Pin the vtable to this file.
 void PPCInstrInfo::anchor() {}
 
-PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
-    : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI)
+    : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
                       /* CatchRetOpcode */ -1,
                       STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
       Subtarget(STI), RI(STI.getTargetMachine()) {}
@@ -1863,6 +1864,48 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
         .addReg(SrcRegSub1)
         .addReg(SrcRegSub1, getKillRegState(KillSrc));
     return;
+  } else if ((PPC::WACCRCRegClass.contains(DestReg) ||
+              PPC::WACC_HIRCRegClass.contains(DestReg)) &&
+             (PPC::WACCRCRegClass.contains(SrcReg) ||
+              PPC::WACC_HIRCRegClass.contains(SrcReg))) {
+
+    Opc = PPC::WACCRCRegClass.contains(SrcReg) ? PPC::DMXXEXTFDMR512
+                                               : PPC::DMXXEXTFDMR512_HI;
+
+    RegScavenger RS;
+    RS.enterBasicBlockEnd(MBB);
+    RS.backward(std::next(I));
+
+    Register TmpReg1 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I,
+                                                    /* RestoreAfter */ false, 0,
+                                                    /* AllowSpill */ false);
+
+    RS.setRegUsed(TmpReg1);
+    Register TmpReg2 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I,
+                                                    /* RestoreAfter */ false, 0,
+                                                    /* AllowSpill */ false);
+
+    BuildMI(MBB, I, DL, get(Opc))
+        .addReg(TmpReg1, RegState::Define)
+        .addReg(TmpReg2, RegState::Define)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+
+    Opc = PPC::WACCRCRegClass.contains(DestReg) ? PPC::DMXXINSTDMR512
+                                                : PPC::DMXXINSTDMR512_HI;
+
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+        .addReg(TmpReg1, RegState::Kill)
+        .addReg(TmpReg2, RegState::Kill);
+
+    return;
+  } else if (PPC::DMRRCRegClass.contains(DestReg) &&
+             PPC::DMRRCRegClass.contains(SrcReg)) {
+
+    BuildMI(MBB, I, DL, get(PPC::DMMR), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+
+    return;
+
   } else
     llvm_unreachable("Impossible reg-to-reg copy");
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 7931a9e3ae13..63ebd6591057 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -279,7 +279,7 @@ enum PPCMachineCombinerPattern : unsigned {
 
 class PPCSubtarget;
 class PPCInstrInfo : public PPCGenInstrInfo {
-  PPCSubtarget &Subtarget;
+  const PPCSubtarget &Subtarget;
   const PPCRegisterInfo RI;
   const unsigned StoreSpillOpcodesArray[4][SOK_LastOpcodeSpill] =
       StoreOpcodesForSpill;
@@ -369,7 +369,7 @@ protected:
                                        unsigned OpIdx2) const override;
 
 public:
-  explicit PPCInstrInfo(PPCSubtarget &STI);
+  explicit PPCInstrInfo(const PPCSubtarget &STI);
 
   bool isLoadFromConstantPool(MachineInstr *I) const;
   const Constant *getConstantFromConstantPool(MachineInstr *I) const;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index c2f91ce8e6b9..c12cf8511312 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
   SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
 ]>;
 
+def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>
+]>;
+
 def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
   SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
 ]>;
@@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
 def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
 def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
 
+def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>;
+
 def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
                              SDTFPUnaryOp, [SDNPHasChain]>;
 def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
@@ -665,9 +671,6 @@ class isRecordForm   { bit RC = 1; }
 class RegConstraint<string C> {
   string Constraints = C;
 }
-class NoEncode<string E> {
-  string DisableEncoding = E;
-}
 
 
 // Define PowerPC specific addressing mode.
@@ -1989,29 +1992,24 @@ def LBZU : DForm_1<35, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D,
 
 def LHAU : DForm_1<43, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
                    "lhau $RST, $addr", IIC_LdStLHAU,
-                   []>, RegConstraint<"$addr.reg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.reg = $ea_result">;
 
 def LHZU : DForm_1<41, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
                    "lhzu $RST, $addr", IIC_LdStLoadUpd,
-                   []>, RegConstraint<"$addr.reg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.reg = $ea_result">;
 
 def LWZU : DForm_1<33, (outs gprc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
                    "lwzu $RST, $addr", IIC_LdStLoadUpd,
-                   []>, RegConstraint<"$addr.reg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.reg = $ea_result">;
 
 let Predicates = [HasFPU] in {
 def LFSU : DForm_1<49, (outs f4rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
                   "lfsu $RST, $addr", IIC_LdStLFDU,
-                  []>, RegConstraint<"$addr.reg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                  []>, RegConstraint<"$addr.reg = $ea_result">;
 
 def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D, $RA):$addr),
                   "lfdu $RST, $addr", IIC_LdStLFDU,
-                  []>, RegConstraint<"$addr.reg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                  []>, RegConstraint<"$addr.reg = $ea_result">;
 }
 
 
@@ -2019,39 +2017,33 @@ def LFDU : DForm_1<51, (outs f8rc:$RST, ptr_rc_nor0:$ea_result), (ins (memri $D,
 def LBZUX : XForm_1_memOp<31, 119, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lbzux $RST, $addr", IIC_LdStLoadUpdX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 
 def LHAUX : XForm_1_memOp<31, 375, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lhaux $RST, $addr", IIC_LdStLHAUX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 
 def LHZUX : XForm_1_memOp<31, 311, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lhzux $RST, $addr", IIC_LdStLoadUpdX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 
 def LWZUX : XForm_1_memOp<31, 55, (outs gprc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lwzux $RST, $addr", IIC_LdStLoadUpdX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 
 let Predicates = [HasFPU] in {
 def LFSUX : XForm_1_memOp<31, 567, (outs f4rc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lfsux $RST, $addr", IIC_LdStLFDUX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 
 def LFDUX : XForm_1_memOp<31, 631, (outs f8rc:$RST, ptr_rc_nor0:$ea_result),
                    (ins (memrr $RA, $RB):$addr),
                    "lfdux $RST, $addr", IIC_LdStLFDUX,
-                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
-                   NoEncode<"$ea_result">;
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">;
 }
 }
 }
@@ -2132,20 +2124,20 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$RST, (memri $D, $RA):$dst),
 let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
 def STBU  : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
                     "stbu $RST, $dst", IIC_LdStSTU, []>,
-                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+                    RegConstraint<"$dst.reg = $ea_res">;
 def STHU  : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
                     "sthu $RST, $dst", IIC_LdStSTU, []>,
-                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+                    RegConstraint<"$dst.reg = $ea_res">;
 def STWU  : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$RST, (memri $D, $RA):$dst),
                     "stwu $RST, $dst", IIC_LdStSTU, []>,
-                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+                    RegConstraint<"$dst.reg = $ea_res">;
 let Predicates = [HasFPU] in {
 def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$RST, (memri $D, $RA):$dst),
                     "stfsu $RST, $dst", IIC_LdStSTFDU, []>,
-                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+                    RegConstraint<"$dst.reg = $ea_res">;
 def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$RST, (memri $D, $RA):$dst),
                     "stfdu $RST, $dst", IIC_LdStSTFDU, []>,
-                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+                    RegConstraint<"$dst.reg = $ea_res">;
 }
 }
 
@@ -2207,32 +2199,27 @@ def STBUX : XForm_8_memOp<31, 247, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$RST, (memrr $RA, $RB):$addr),
                           "stbux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STHUX : XForm_8_memOp<31, 439, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$RST, (memrr $RA, $RB):$addr),
                           "sthux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STWUX : XForm_8_memOp<31, 183, (outs ptr_rc_nor0:$ea_res),
                           (ins gprc:$RST, (memrr $RA, $RB):$addr),
                           "stwux $RST, $addr", IIC_LdStSTUX, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 let Predicates = [HasFPU] in {
 def STFSUX: XForm_8_memOp<31, 695, (outs ptr_rc_nor0:$ea_res),
                           (ins f4rc:$RST, (memrr $RA, $RB):$addr),
                           "stfsux $RST, $addr", IIC_LdStSTFDU, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 def STFDUX: XForm_8_memOp<31, 759, (outs ptr_rc_nor0:$ea_res),
                           (ins f8rc:$RST, (memrr $RA, $RB):$addr),
                           "stfdux $RST, $addr", IIC_LdStSTFDU, []>,
                           RegConstraint<"$addr.ptrreg = $ea_res">,
-                          NoEncode<"$ea_res">,
                           PPC970_DGroup_Cracked;
 }
 }
@@ -3099,7 +3086,7 @@ defm RLWIMI : MForm_2r<20, (outs gprc:$RA),
                        (ins gprc:$RAi, gprc:$RS, u5imm:$SH, u5imm:$MB,
                        u5imm:$ME), "rlwimi", "$RA, $RS, $SH, $MB, $ME",
                        IIC_IntRotate, []>, PPC970_DGroup_Cracked,
-                       RegConstraint<"$RAi = $RA">, NoEncode<"$RAi">;
+                       RegConstraint<"$RAi = $RA">;
 }
 let BaseName = "rlwinm" in {
 def RLWINM : MForm_2<21,
@@ -3235,9 +3222,10 @@ def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
 
 // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
 // This uses two output registers, the first as the real output, the second as a
-// temporary register, used internally in code generation.
+// temporary register, used internally in code generation. A "bl" also clobbers LR.
+let Defs = [LR] in
 def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
-                []>, NoEncode<"$rT">;
+                []>;
 
 def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc_nor0:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
                            "#LDgotTprelL32",
@@ -4287,7 +4275,7 @@ def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
   bits<1> E;
 
   let Inst{16} = E;
-  let Inst{21-30} = 163;
+  let Inst{21...30} = 163;
 }
 
 def DCCCI : XForm_tlb<454, (outs), (ins gprc:$RA, gprc:$RB),
@@ -4967,44 +4955,44 @@ defm : BranchSimpleMnemonic1<"dzf", "", 2>;
 
 multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
   def : InstAlias<"b"#name#pm#" $cc, $dst",
-                  (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+                  (BCC (pred bibo, crrc:$cc), condbrtarget:$dst)>;
   def : InstAlias<"b"#name#pm#" $dst",
-                  (BCC bibo, CR0, condbrtarget:$dst)>;
+                  (BCC (pred bibo, CR0), condbrtarget:$dst)>;
 
   def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
-                  (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+                  (BCCA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>;
   def : InstAlias<"b"#name#"a"#pm#" $dst",
-                  (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+                  (BCCA (pred bibo, CR0), abscondbrtarget:$dst)>;
 
   def : InstAlias<"b"#name#"lr"#pm#" $cc",
-                  (BCCLR bibo, crrc:$cc)>;
+                  (BCCLR (pred bibo, crrc:$cc))>;
   def : InstAlias<"b"#name#"lr"#pm,
-                  (BCCLR bibo, CR0)>;
+                  (BCCLR (pred bibo, CR0))>;
 
   def : InstAlias<"b"#name#"ctr"#pm#" $cc",
-                  (BCCCTR bibo, crrc:$cc)>;
+                  (BCCCTR (pred bibo, crrc:$cc))>;
   def : InstAlias<"b"#name#"ctr"#pm,
-                  (BCCCTR bibo, CR0)>;
+                  (BCCCTR (pred bibo, CR0))>;
 
   def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
-                  (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+                  (BCCL (pred bibo, crrc:$cc), condbrtarget:$dst)>;
   def : InstAlias<"b"#name#"l"#pm#" $dst",
-                  (BCCL bibo, CR0, condbrtarget:$dst)>;
+                  (BCCL (pred bibo, CR0), condbrtarget:$dst)>;
 
   def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
-                  (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+                  (BCCLA (pred bibo, crrc:$cc), abscondbrtarget:$dst)>;
   def : InstAlias<"b"#name#"la"#pm#" $dst",
-                  (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+                  (BCCLA (pred bibo, CR0), abscondbrtarget:$dst)>;
 
   def : InstAlias<"b"#name#"lrl"#pm#" $cc",
-                  (BCCLRL bibo, crrc:$cc)>;
+                  (BCCLRL (pred bibo, crrc:$cc))>;
   def : InstAlias<"b"#name#"lrl"#pm,
-                  (BCCLRL bibo, CR0)>;
+                  (BCCLRL (pred bibo, CR0))>;
 
   def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
-                  (BCCCTRL bibo, crrc:$cc)>;
+                  (BCCCTRL (pred bibo, crrc:$cc))>;
   def : InstAlias<"b"#name#"ctrl"#pm,
-                  (BCCCTRL bibo, CR0)>;
+                  (BCCCTRL (pred bibo, CR0))>;
 }
 multiclass BranchExtendedMnemonic<string name, int bibo> {
   defm : BranchExtendedMnemonicPM<name, "", bibo>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
index 436715a0e4ab..b38dd4ae948c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -14,7 +14,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
   def PP :
     XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL),
                      !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   def NAME#W :
@@ -24,7 +24,7 @@ multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
   def WPP :
     XX3Form_AT3_XAB6<opcode, xo, (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
                      !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -48,7 +48,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -65,7 +65,7 @@ multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -89,7 +89,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -106,7 +106,7 @@ multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -129,7 +129,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -145,7 +145,7 @@ multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -162,7 +162,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
     XX3Form_AT3_XAB6<
       opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL),
       !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
   def PM#NAME :
@@ -179,7 +179,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   def NAME#W :
@@ -190,7 +190,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
     XX3Form_AT3_XAB6<
       opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
       !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -207,7 +207,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
            !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -220,29 +220,29 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
   def PN : XX3Form_AT3_XAB6<
              opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL),
              !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def NP : XX3Form_AT3_XAB6<
              opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL),
              !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def NN : XX3Form_AT3_XAB6<
              opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL),
              !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   def WPN : XX3Form_AT3_XAB6<
               opcode, !or(xo, 0x80), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
               !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def WNP : XX3Form_AT3_XAB6<
               opcode, !or(xo, 0x40), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
               !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def WNN : XX3Form_AT3_XAB6<
               opcode, !or(xo, 0xC0), (outs wacc:$AT), !con((ins wacc:$ATi), IOL),
               !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
   def PM#NAME#PN :
@@ -251,21 +251,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NP :
     MMIRR_XX3Form_XY4P2_XAB6<
       opcode, !or(xo, 0x40), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NN :
     MMIRR_XX3Form_XY4P2_XAB6<
       opcode, !or(xo, 0xC0), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#WPN :
@@ -274,21 +274,21 @@ multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNP :
     MMIRR_XX3Form_XY4P2_XAB6<
       opcode, !or(xo, 0x40), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNN :
     MMIRR_XX3Form_XY4P2_XAB6<
       opcode, !or(xo, 0xC0), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -301,29 +301,29 @@ multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
   def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT),
                             !con((ins acc:$ATi), IOL),
                             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT),
                             !con((ins acc:$ATi), IOL),
                             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT),
                             !con((ins acc:$ATi), IOL),
                             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   def WPN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs wacc:$AT),
                             !con((ins wacc:$ATi), IOL),
                             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def WNP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs wacc:$AT),
                             !con((ins wacc:$ATi), IOL),
                             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   def WNN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs wacc:$AT),
                             !con((ins wacc:$ATi), IOL),
                             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
-           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+           RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -346,28 +346,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#PN :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0x80), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NP :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0x40), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NN :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0xC0), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -383,28 +383,28 @@ multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WPN :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0x80), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNP :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0x40), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNN :
     MMIRR_XX3Form_XY4_XAB6<
       opcode, !or(xo, 0xC0), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -427,28 +427,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#PN :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0x80), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NP :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0x40), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#NN :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0xC0), (outs acc:$AT),
       !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
   let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
   def PM#NAME#W :
@@ -464,28 +464,28 @@ multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WPN :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0x80), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNP :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0x40), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   def PM#NAME#WNN :
     MMIRR_XX3Form_X4Y2_XAB6<
       opcode, !or(xo, 0xC0), (outs wacc:$AT),
       !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
       !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
       IIC_VecFP, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   }
 }
 
@@ -497,12 +497,12 @@ let Predicates = [MMA, IsNotISAFuture] in {
     XForm_AT3<31, 0, 177, (outs acc:$ATo), (ins acc:$AT), "xxmfacc $AT",
               IIC_VecGeneral,
               [(set v512i1:$ATo, (int_ppc_mma_xxmfacc v512i1:$AT))]>,
-              RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">;
+              RegConstraint<"$ATo = $AT">;
   def XXMTACC :
     XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT",
               IIC_VecGeneral,
               [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>,
-              RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+              RegConstraint<"$ATi = $AT">;
   def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
                                       "#KILL_PAIR", []>,
                                       RegConstraint<"$XTp = $XSp">;
@@ -519,7 +519,7 @@ let Predicates = [MMA, IsNotISAFuture] in {
   def XVI8GER4SPP :
     XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
                      "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
   let mayStore = 1 in {
     def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
                                      "#SPILL_ACC", []>;
@@ -544,11 +544,11 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   def XXMFACCW :
     XForm_AT3<31, 0, 177, (outs wacc:$ATo), (ins wacc:$AT), "xxmfacc $AT",
               IIC_VecGeneral, []>,
-              RegConstraint<"$ATo = $AT">, NoEncode<"$ATo">;
+              RegConstraint<"$ATo = $AT">;
   def XXMTACCW :
     XForm_AT3<31, 1, 177, (outs wacc:$AT), (ins wacc:$ATi), "xxmtacc $AT",
               IIC_VecGeneral, []>,
-              RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+              RegConstraint<"$ATi = $AT">;
 
   let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
     def DMXXSETACCZ :
@@ -560,7 +560,7 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
     XX3Form_AT3_XAB6<59, 99, (outs wacc:$AT),
                      (ins wacc:$ATi, vsrc:$XA, vsrc:$XB),
                      "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
-                     RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+                     RegConstraint<"$ATi = $AT">;
 
   let mayStore = 1 in {
     def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst),
@@ -593,7 +593,7 @@ let Predicates = [MMA, PrefixInstrs, IsNotISAFuture] in {
                              u4imm:$YMSK, u4imm:$PMSK),
                             "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
                             IIC_VecGeneral, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
 }
 
 let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
@@ -603,7 +603,7 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture], isCodeGenOnly = 1 in {
                              u4imm:$YMSK, u4imm:$PMSK),
                             "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
                             IIC_VecGeneral, []>,
-    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+    RegConstraint<"$ATi = $AT">;
 }
 
 // MMA accumulating/non-accumulating instructions.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index c4a027d65b66..149a44ddfc10 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -125,8 +125,8 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let InOperandList = IOL;
   let AsmString = asmstr;
   let Itinerary = itin;
-  let Inst{0-5} = pref;
-  let Inst{32-37} = opcode;
+  let Inst{0...5} = pref;
+  let Inst{32...37} = opcode;
 
   bits<1> PPC970_First = 0;
   bits<1> PPC970_Single = 0;
@@ -138,7 +138,7 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let TSFlags{0}   = PPC970_First;
   let TSFlags{1}   = PPC970_Single;
   let TSFlags{2}   = PPC970_Cracked;
-  let TSFlags{5-3} = PPC970_Unit;
+  let TSFlags{5...3} = PPC970_Unit;
 
   bits<1> Prefixed = 1;  // This is a prefixed instruction.
   let TSFlags{7}  = Prefixed;
@@ -167,11 +167,11 @@ class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10} = VT;
-  let Inst{11-15} = R;
-  let Inst{16-20} = VB;
+  let Inst{6...10} = VT;
+  let Inst{11...15} = R;
+  let Inst{16...20} = VB;
   let Inst{21} = RC;
-  let Inst{22-31} = xo;
+  let Inst{22...31} = xo;
 }
 
 // Multiclass definition to account for record and non-record form
@@ -200,16 +200,16 @@ class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
+  let Inst{6...7} = 2;
+  let Inst{8...10} = 0;
   let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D{33-16}; // d0
+  let Inst{12...13} = 0;
+  let Inst{14...31} = D{33...16}; // d0
 
   // The instruction.
-  let Inst{38-42} = RST{4-0};
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D{15-0}; // d1
+  let Inst{38...42} = RST{4...0};
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D{15...0}; // d1
 }
 
 class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -222,16 +222,16 @@ class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
+  let Inst{6...7} = 2;
+  let Inst{8...10} = 0;
   let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = SI{33-16};
+  let Inst{12...13} = 0;
+  let Inst{14...31} = SI{33...16};
 
   // The instruction.
-  let Inst{38-42} = RT;
-  let Inst{43-47} = RA;
-  let Inst{48-63} = SI{15-0};
+  let Inst{38...42} = RT;
+  let Inst{43...47} = RA;
+  let Inst{48...63} = SI{15...0};
 }
 
 class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -243,16 +243,16 @@ class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 2;
-  let Inst{8-10} = 0;
+  let Inst{6...7} = 2;
+  let Inst{8...10} = 0;
   let Inst{11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = SI{33-16};
+  let Inst{12...13} = 0;
+  let Inst{14...31} = SI{33...16};
 
   // The instruction.
-  let Inst{38-42} = RT;
-  let Inst{43-47} = 0;
-  let Inst{48-63} = SI{15-0};
+  let Inst{38...42} = RT;
+  let Inst{43...47} = 0;
+  let Inst{48...63} = SI{15...0};
 }
 
 multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
@@ -274,15 +274,15 @@ class 8LS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-10} = 0;
+  let Inst{6...10} = 0;
   let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D{33-16}; // d0
+  let Inst{12...13} = 0;
+  let Inst{14...31} = D{33...16}; // d0
 
   // The instruction.
-  let Inst{38-42} = RST{4-0};
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D{15-0}; // d1
+  let Inst{38...42} = RST{4...0};
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D{15...0}; // d1
 }
 
 // 8LS:D-Form: [ 1 0 0 // R // d0
@@ -298,18 +298,18 @@ class 8LS_DForm_R_SI34_XT6_RA5_MEM<bits<5> opcode, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 0;
+  let Inst{6...7} = 0;
   let Inst{8} = 0;
-  let Inst{9-10} = 0; // reserved
+  let Inst{9...10} = 0; // reserved
   let Inst{11} = PCRel;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-31} = D{33-16}; // d0
+  let Inst{12...13} = 0; // reserved
+  let Inst{14...31} = D{33...16}; // d0
 
   // The instruction.
   let Inst{37} = XST{5};
-  let Inst{38-42} = XST{4-0};
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D{15-0}; // d1
+  let Inst{38...42} = XST{4...0};
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D{15...0}; // d1
 }
 
 // X-Form: [PO T IMM VRB XO TX]
@@ -321,10 +321,10 @@ class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<5> IMM;
 
   let Pattern = pattern;
-  let Inst{6-10} = XT{4-0};
-  let Inst{11-15} = IMM;
-  let Inst{16-20} = VRB;
-  let Inst{21-30} = xo;
+  let Inst{6...10} = XT{4...0};
+  let Inst{11...15} = IMM;
+  let Inst{16...20} = VRB;
+  let Inst{21...30} = xo;
   let Inst{31} = XT{5};
 }
 
@@ -341,19 +341,19 @@ class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo,
     let Pattern = pattern;
 
     // The prefix.
-    let Inst{6-7} = 1;
+    let Inst{6...7} = 1;
     let Inst{8} = 0;
-    let Inst{9-11} = 0;
-    let Inst{12-13} = 0;
-    let Inst{14-23} = 0;
-    let Inst{24-31} = IMM;
+    let Inst{9...11} = 0;
+    let Inst{12...13} = 0;
+    let Inst{14...23} = 0;
+    let Inst{24...31} = IMM;
 
     // The instruction.
-    let Inst{38-42} = XT{4-0};
-    let Inst{43-47} = XA{4-0};
-    let Inst{48-52} = XB{4-0};
-    let Inst{53-57} = XC{4-0};
-    let Inst{58-59} = xo;
+    let Inst{38...42} = XT{4...0};
+    let Inst{43...47} = XA{4...0};
+    let Inst{48...52} = XB{4...0};
+    let Inst{53...57} = XC{4...0};
+    let Inst{58...59} = xo;
     let Inst{60} = XC{5};
     let Inst{61} = XA{5};
     let Inst{62} = XB{5};
@@ -369,11 +369,11 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RD;
-  let Inst{11-12} = 0;
-  let Inst{13-15} = N;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = RD;
+  let Inst{11...12} = 0;
+  let Inst{13...15} = N;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 
@@ -382,14 +382,14 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
 class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
   : VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, vrrc:$VB),
              !strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>,
-             RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+             RegConstraint<"$VDi = $VD">;
 
 // VX-Form: [PO VRT RA RB XO].
 // Destructive (insert) forms are suffixed with _ins.
 class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
   : VXForm_1<xo, (outs vrrc:$VD), (ins vrrc:$VDi, gprc:$VA, gprc:$VB),
              !strconcat(opc, " $VD, $VA, $VB"), IIC_VecGeneral, pattern>,
-             RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+             RegConstraint<"$VDi = $VD">;
 
 // VX-Form: [ PO BF // VRA VRB XO ]
 class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -401,11 +401,11 @@ class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-8} = BF;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = VA;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{6...8} = BF;
+  let Inst{9...10} = 0;
+  let Inst{11...15} = VA;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 // VN-Form: [PO VRT VRA VRB PS SD XO]
@@ -420,12 +420,12 @@ class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = VRT;
-  let Inst{11-15} = VRA;
-  let Inst{16-20} = VRB;
-  let Inst{21-22} = ps;
-  let Inst{23-25} = SD;
-  let Inst{26-31} = xo;
+  let Inst{6...10}  = VRT;
+  let Inst{11...15} = VRA;
+  let Inst{16...20} = VRB;
+  let Inst{21...22} = ps;
+  let Inst{23...25} = SD;
+  let Inst{26...31} = xo;
 }
 
 class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
@@ -437,11 +437,11 @@ class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RD;
-  let Inst{11-14} = eo;
+  let Inst{6...10}  = RD;
+  let Inst{11...14} = eo;
   let Inst{15} = MP;
-  let Inst{16-20} = VB;
-  let Inst{21-31} = xo;
+  let Inst{16...20} = VB;
+  let Inst{21...31} = xo;
 }
 
 // 8RR:D-Form: [ 1 1 0 // // imm0
@@ -456,17 +456,17 @@ class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-15} = 0; // reserved
-  let Inst{16-31} = IMM32{31-16};
+  let Inst{6...7} = 1;
+  let Inst{8...11} = 0;
+  let Inst{12...13} = 0; // reserved
+  let Inst{14...15} = 0; // reserved
+  let Inst{16...31} = IMM32{31...16};
 
   // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-46} = xo;
+  let Inst{38...42} = XT{4...0};
+  let Inst{43...46} = xo;
   let Inst{47} = XT{5};
-  let Inst{48-63} = IMM32{15-0};
+  let Inst{48...63} = IMM32{15...0};
 }
 
 // 8RR:D-Form: [ 1 1 0 // // imm0
@@ -482,18 +482,18 @@ class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0; // reserved
-  let Inst{14-15} = 0; // reserved
-  let Inst{16-31} = IMM32{31-16};
+  let Inst{6...7} = 1;
+  let Inst{8...11} = 0;
+  let Inst{12...13} = 0; // reserved
+  let Inst{14...15} = 0; // reserved
+  let Inst{16...31} = IMM32{31...16};
 
   // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-45} = xo;
+  let Inst{38...42} = XT{4...0};
+  let Inst{43...45} = xo;
   let Inst{46} = IX;
   let Inst{47} = XT{5};
-  let Inst{48-63} = IMM32{15-0};
+  let Inst{48...63} = IMM32{15...0};
 }
 
 class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
@@ -507,17 +507,17 @@ class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = 0;
+  let Inst{6...7} = 1;
+  let Inst{8...11} = 0;
+  let Inst{12...13} = 0;
+  let Inst{14...31} = 0;
 
   // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-57} = XC{4-0};
-  let Inst{58-59} = xo;
+  let Inst{38...42} = XT{4...0};
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...57} = XC{4...0};
+  let Inst{58...59} = xo;
   let Inst{60} = XC{5};
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
@@ -537,18 +537,18 @@ class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 1;
-  let Inst{8-11} = 0;
-  let Inst{12-13} = 0;
-  let Inst{14-28} = 0;
-  let Inst{29-31} = IMM;
+  let Inst{6...7} = 1;
+  let Inst{8...11} = 0;
+  let Inst{12...13} = 0;
+  let Inst{14...28} = 0;
+  let Inst{29...31} = IMM;
 
   // The instruction.
-  let Inst{38-42} = XT{4-0};
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-57} = XC{4-0};
-  let Inst{58-59} = xo;
+  let Inst{38...42} = XT{4...0};
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...57} = XC{4...0};
+  let Inst{58...59} = xo;
   let Inst{60} = XC{5};
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
@@ -565,11 +565,11 @@ class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}   = BF;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-29} = xo;
+  let Inst{6...8}   = BF;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = xo2;
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...29} = xo;
   let Inst{30}    = XB{5};
   let Inst{31}    = 0;
 }
@@ -863,11 +863,11 @@ class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-9} = XTp{3-0};
+  let Inst{6...9} = XTp{3...0};
   let Inst{10} = XTp{4};
-  let Inst{11-15} = RA;
-  let Inst{16-27} = DQ;
-  let Inst{28-31} = xo;
+  let Inst{11...15} = RA;
+  let Inst{16...27} = DQ;
+  let Inst{28...31} = xo;
 }
 
 class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
@@ -878,11 +878,11 @@ class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<5> RB;
 
   let Pattern = pattern;
-  let Inst{6-9} = XTp{3-0};
+  let Inst{6...9} = XTp{3...0};
   let Inst{10} = XTp{4};
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-30} = xo;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
@@ -896,16 +896,16 @@ class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-10} = 0;
+  let Inst{6...10} = 0;
   let Inst{11} = PCRel;
-  let Inst{12-13} = 0;
-  let Inst{14-31} = D{33-16}; // Imm18
+  let Inst{12...13} = 0;
+  let Inst{14...31} = D{33...16}; // Imm18
 
   // The instruction.
-  let Inst{38-41} = XTp{3-0};
+  let Inst{38...41} = XTp{3...0};
   let Inst{42}    = XTp{4};
-  let Inst{43-47} = RA;
-  let Inst{48-63} = D{15-0};
+  let Inst{43...47} = RA;
+  let Inst{48...63} = D{15...0};
 }
 
 multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL,
@@ -935,11 +935,11 @@ class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8}  = AT;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = xo2;
-  let Inst{16-20} = 0;
-  let Inst{21-30} = xo;
+  let Inst{6...8}  = AT;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = xo2;
+  let Inst{16...20} = 0;
+  let Inst{21...30} = xo;
   let Inst{31} = 0;
 }
 
@@ -952,10 +952,10 @@ class XForm_XT6_IMM5<bits<6> opcode, bits<5> eo, bits<10> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-10} = XT{4-0};
-  let Inst{11-15} = eo;
-  let Inst{16-20} = UIM;
-  let Inst{21-30} = xo;
+  let Inst{6...10} = XT{4...0};
+  let Inst{11...15} = eo;
+  let Inst{16...20} = UIM;
+  let Inst{21...30} = xo;
   let Inst{31} = XT{5};
 }
 
@@ -969,11 +969,11 @@ class XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
 
   let Pattern = pattern;
 
-  let Inst{6-8} = AT;
-  let Inst{9-10} = 0;
-  let Inst{11-15} = XA{4-0};
-  let Inst{16-20} = XB{4-0};
-  let Inst{21-28} = xo;
+  let Inst{6...8} = AT;
+  let Inst{9...10} = 0;
+  let Inst{11...15} = XA{4...0};
+  let Inst{16...20} = XB{4...0};
+  let Inst{21...28} = xo;
   let Inst{29}    = XA{5};
   let Inst{30}    = XB{5};
   let Inst{31} = 0;
@@ -993,20 +993,20 @@ class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-17} = PMSK;
-  let Inst{18-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...15} = 0;
+  let Inst{16...17} = PMSK;
+  let Inst{18...23} = 0;
+  let Inst{24...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -1025,18 +1025,18 @@ class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...23} = 0;
+  let Inst{24...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -1055,19 +1055,19 @@ class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-29} = YMSK;
-  let Inst{30-31} = 0;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...23} = 0;
+  let Inst{24...27} = XMSK;
+  let Inst{28...29} = YMSK;
+  let Inst{30...31} = 0;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -1087,19 +1087,19 @@ class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-23} = PMSK;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...15} = 0;
+  let Inst{16...23} = PMSK;
+  let Inst{24...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -1119,20 +1119,20 @@ class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Pattern = pattern;
 
   // The prefix.
-  let Inst{6-7} = 3;
-  let Inst{8-11} = 9;
-  let Inst{12-15} = 0;
-  let Inst{16-19} = PMSK;
-  let Inst{20-23} = 0;
-  let Inst{24-27} = XMSK;
-  let Inst{28-31} = YMSK;
+  let Inst{6...7} = 3;
+  let Inst{8...11} = 9;
+  let Inst{12...15} = 0;
+  let Inst{16...19} = PMSK;
+  let Inst{20...23} = 0;
+  let Inst{24...27} = XMSK;
+  let Inst{28...31} = YMSK;
 
   // The instruction.
-  let Inst{38-40} = AT;
-  let Inst{41-42} = 0;
-  let Inst{43-47} = XA{4-0};
-  let Inst{48-52} = XB{4-0};
-  let Inst{53-60} = xo;
+  let Inst{38...40} = AT;
+  let Inst{41...42} = 0;
+  let Inst{43...47} = XA{4...0};
+  let Inst{48...52} = XB{4...0};
+  let Inst{53...60} = xo;
   let Inst{61} = XA{5};
   let Inst{62} = XB{5};
   let Inst{63} = 0;
@@ -1395,7 +1395,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [P
                              [(set v2i64:$XT,
                                    (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
                                                    i32:$IMM32))]>,
-                             RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+                             RegConstraint<"$XTi = $XT">;
 }
 
 let Predicates = [IsISA3_1] in {
@@ -1466,13 +1466,13 @@ let Predicates = [IsISA3_1] in {
              "vinsw $VD, $VB, $VA", IIC_VecGeneral,
              [(set v4i32:$VD,
                    (int_ppc_altivec_vinsw v4i32:$VDi, i32:$VB, timm:$VA))]>,
-             RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+             RegConstraint<"$VDi = $VD">;
   def VINSD :
     VXForm_1<463, (outs vrrc:$VD), (ins vrrc:$VDi, u4imm:$VA, g8rc:$VB),
              "vinsd $VD, $VB, $VA", IIC_VecGeneral,
              [(set v2i64:$VD,
                    (int_ppc_altivec_vinsd v2i64:$VDi, i64:$VB, timm:$VA))]>,
-             RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+             RegConstraint<"$VDi = $VD">;
   def VINSBVLX :
     VXForm_VTB5_RA5_ins<15, "vinsbvlx",
                         [(set v16i8:$VD,
@@ -1538,13 +1538,13 @@ let Predicates = [IsISA3_1] in {
              "vinsdlx $VD, $VA, $VB", IIC_VecGeneral,
               [(set v2i64:$VD,
                     (int_ppc_altivec_vinsdlx v2i64:$VDi, i64:$VA, i64:$VB))]>,
-              RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+              RegConstraint<"$VDi = $VD">;
   def VINSDRX :
     VXForm_1<975, (outs vrrc:$VD), (ins vrrc:$VDi, g8rc:$VA, g8rc:$VB),
              "vinsdrx $VD, $VA, $VB", IIC_VecGeneral,
               [(set v2i64:$VD,
                     (int_ppc_altivec_vinsdrx v2i64:$VDi, i64:$VA, i64:$VB))]>,
-              RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+              RegConstraint<"$VDi = $VD">;
   def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$VD), (ins vrrc:$VB),
                                       "vextractbm $VD, $VB", IIC_VecGeneral,
                                       [(set i32:$VD,
@@ -1915,10 +1915,11 @@ let Predicates = [IsISA3_1] in {
                         [(set v1i128:$VD,
                           (int_ppc_altivec_vrlqmi v1i128:$VA, v1i128:$VB,
                                                   v1i128:$VDi))]>,
-                        RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
+                        RegConstraint<"$VDi = $VD">;
   def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
   def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
-  def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+  def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", 
+                            [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>;
   def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
   def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
   def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
@@ -2053,6 +2054,9 @@ let Predicates = [IsISA3_1, HasFPU] in {
 
 //---------------------------- Anonymous Patterns ----------------------------//
 let Predicates = [IsISA3_1] in {
+  // Exploit the vsrq instruction: fold VSR(VSRO(input, vsro_byte_shift),
+  // vsr_bit_shift) into a single VSRQ(input, vsrq_bit_shift).
+  def : Pat<(VSRVSRO v4i32:$vA, v4i32:$vB), (VSRQ $vA, $vB)>;
   // Exploit the vector multiply high instructions using intrinsics.
   def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)),
             (v4i32 (VMULHSW $vA, $vB))>;
@@ -2230,6 +2234,13 @@ def VEqv
                                             (v4i32(bitconvert node:$a)),
                                             (v4i32(bitconvert node:$b)))))]>;
 
+// Vector NAND, not(and(a, b)); also matches the same form bitcast via v4i32.
+def VNand
+    : PatFrags<(ops node:$a, node:$b), [(vnot(and node:$a, node:$b)),
+                                        (bitconvert(vnot(and
+                                            (v4i32(bitconvert node:$a)),
+                                            (v4i32(bitconvert node:$b)))))]>;
+
 // =============================================================================
 // XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd
 // This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C)
@@ -2265,6 +2276,56 @@ multiclass XXEvalTernarySelectAnd<ValueType Vt> {
             Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>;
 }
 
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectB
+// This multiclass matches the ternary operation A ? f(B,C) : B
+// and emits the corresponding xxeval instruction with the given immediate.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector (first operand of vselect)
+// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND)
+// - B is the "false" case operand (vector B)
+//
+// Note: Patterns (A ? C : B) and (A ? not(C) : B) are not considered
+// for the XXEVAL instruction (4 cycles), as the XXSEL instruction (3 cycles)
+// performs better.
+// =============================================================================
+multiclass XXEvalTernarySelectB<ValueType Vt>{
+  // Pattern: (A ? AND(B,C) : B), XXEVAL immediate value: 49 (0b00110001)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vB), 49>;
+  // Pattern: (A ? NOR(B,C) : B), XXEVAL immediate value: 56 (0b00111000)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vB), 56>;
+  // Pattern: (A ? EQV(B,C) : B), XXEVAL immediate value: 57 (0b00111001)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vB), 57>;
+  // Pattern: (A ? NAND(B,C) : B), XXEVAL immediate value: 62 (0b00111110)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vB), 62>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectC
+// This multiclass matches the ternary operation A ? f(B,C) : C
+// and emits the corresponding xxeval instruction with the given immediate.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector (first operand of vselect)
+// - f(B,C) is the "true" case op on vectors B and C (AND, NOR, EQV, NAND)
+// - C is the "false" case operand (vector C)
+//
+// Note: Patterns (A ? B : C) and (A ? not(B) : C) are not considered
+// for the XXEVAL instruction (4 cycles), as the XXSEL instruction (3 cycles)
+// performs better.
+// =============================================================================
+multiclass XXEvalTernarySelectC<ValueType Vt>{
+  // Pattern: (A ? AND(B,C) : C), XXEVAL immediate value: 81 (0b01010001)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), Vt:$vC), 81>;
+  // Pattern: (A ? NOR(B,C) : C), XXEVAL immediate value: 88 (0b01011000)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), Vt:$vC), 88>;
+  // Pattern: (A ? EQV(B,C) : C), XXEVAL immediate value: 89 (0b01011001)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), Vt:$vC), 89>;
+  // Pattern: (A ? NAND(B,C) : C), XXEVAL immediate value: 94 (0b01011110)
+  def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), Vt:$vC), 94>;
+}
+
 let Predicates = [PrefixInstrs, HasP10Vector] in {
   let AddedComplexity = 400 in {
     def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
@@ -2376,6 +2437,8 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
     // XXEval Patterns for ternary Operations.
     foreach Ty = [v4i32, v2i64, v8i16, v16i8] in {
         defm : XXEvalTernarySelectAnd<Ty>;
+        defm : XXEvalTernarySelectB<Ty>;
+        defm : XXEvalTernarySelectC<Ty>; 
     }
 
     // Anonymous patterns to select prefixed VSX loads and stores.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index e91cae349e08..5104cc6f5607 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -20,10 +20,10 @@ class EFXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...31} = xo;
 }
 
 class EFXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -45,11 +45,11 @@ class EFXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RA;
   bits<5> RB;
 
-  let Inst{6-8}  = crD;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-31} = xo;
+  let Inst{6...8}  = crD;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...31} = xo;
 }
 
 class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -61,10 +61,10 @@ class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...31} = xo;
 }
 
 class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -88,11 +88,11 @@ class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-8}  = crD;
-  let Inst{9-10}  = 0;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-31} = xo;
+  let Inst{6...8}  = crD;
+  let Inst{9...10}  = 0;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...31} = xo;
 }
 
 class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
@@ -105,11 +105,11 @@ class EVXForm_4<bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = RB;
-  let Inst{21-28} = xo;
-  let Inst{29-31} = crD;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = RB;
+  let Inst{21...28} = xo;
+  let Inst{29...31} = crD;
 }
 
 class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
@@ -121,10 +121,10 @@ class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  let Inst{6-10}  = RT;
-  let Inst{11-15} = RA;
-  let Inst{16-20} = D;
-  let Inst{21-31} = xo;
+  let Inst{6...10}  = RT;
+  let Inst{11...15} = RA;
+  let Inst{16...20} = D;
+  let Inst{21...31} = xo;
 }
 
 let DecoderNamespace = "SPE", Predicates = [HasSPE] in {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 19448210f5db..4e5165bfcda5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -236,7 +236,7 @@ class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
                         list<dag> pattern>
   : XForm_1<opcode, xo, (outs vrrc:$RST), (ins vrrc:$RSTi, vrrc:$RA, vrrc:$RB),
             !strconcat(opc, " $RST, $RA, $RB"), IIC_VecFP, pattern>,
-            RegConstraint<"$RSTi = $RST">, NoEncode<"$RSTi">;
+            RegConstraint<"$RSTi = $RST">;
 
 // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
 class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
@@ -402,13 +402,13 @@ let hasSideEffects = 0 in {
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsmaddadp $XT, $XA, $XB", IIC_VecFP,
                           [(set f64:$XT, (any_fma f64:$XA, f64:$XB, f64:$XTi))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XSMADDMDP : XX3Form<60, 41,
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -418,13 +418,13 @@ let hasSideEffects = 0 in {
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsmsubadp $XT, $XA, $XB", IIC_VecFP,
                           [(set f64:$XT, (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XSMSUBMDP : XX3Form<60, 57,
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -434,13 +434,13 @@ let hasSideEffects = 0 in {
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
                           [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, f64:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XSNMADDMDP : XX3Form<60, 169,
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -450,13 +450,13 @@ let hasSideEffects = 0 in {
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
                           [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XSNMSUBMDP : XX3Form<60, 185,
                           (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
                           "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -466,13 +466,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmaddadp $XT, $XA, $XB", IIC_VecFP,
                           [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVMADDMDP : XX3Form<60, 105,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -482,13 +482,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmaddasp $XT, $XA, $XB", IIC_VecFP,
                           [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVMADDMSP : XX3Form<60, 73,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -498,13 +498,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmsubadp $XT, $XA, $XB", IIC_VecFP,
                           [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVMSUBMDP : XX3Form<60, 121,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -514,13 +514,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmsubasp $XT, $XA, $XB", IIC_VecFP,
                           [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVMSUBMSP : XX3Form<60, 89,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -530,13 +530,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
                           [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVNMADDMDP : XX3Form<60, 233,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -546,13 +546,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
                           [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVNMADDMSP : XX3Form<60, 201,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -562,13 +562,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
                           [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVNMSUBMDP : XX3Form<60, 249,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -578,13 +578,13 @@ let hasSideEffects = 0 in {
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
                           [(set v4f32:$XT, (fneg (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   let IsVSXFMAAlt = 1 in
   def XVNMSUBMSP : XX3Form<60, 217,
                           (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
                           "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -1199,7 +1199,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
                           "xsmaddasp $XT, $XA, $XB", IIC_VecFP,
                           [(set f32:$XT, (any_fma f32:$XA, f32:$XB, f32:$XTi))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
   let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1207,7 +1207,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           (outs vssrc:$XT),
                           (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
                           "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -1219,7 +1219,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           "xsmsubasp $XT, $XA, $XB", IIC_VecFP,
                           [(set f32:$XT, (any_fma f32:$XA, f32:$XB,
                                               (fneg f32:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
   let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1227,7 +1227,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           (outs vssrc:$XT),
                           (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
                           "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -1239,7 +1239,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           "xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
                           [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
                                                     f32:$XTi)))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
   let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1247,7 +1247,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           (outs vssrc:$XT),
                           (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
                           "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -1259,7 +1259,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           "xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
                           [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
                                                     (fneg f32:$XTi))))]>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
   let IsVSXFMAAlt = 1, hasSideEffects = 1 in
@@ -1267,7 +1267,7 @@ let Predicates = [HasVSX, HasP8Vector] in {
                           (outs vssrc:$XT),
                           (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
                           "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
-                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          RegConstraint<"$XTi = $XT">,
                           AltVSXFMARel;
   }
 
@@ -1563,7 +1563,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
                      "xxinsertw $XT, $XB, $UIM5", IIC_VecFP,
                      [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
                                                    imm32SExt16:$UIM5))]>,
-                     RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+                     RegConstraint<"$XTi = $XT">;
 
   // Vector Extract Unsigned Word
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
@@ -1652,11 +1652,11 @@ let Predicates = [HasVSX, HasP9Vector] in {
   def XXPERM  : XX3Form<60, 26, (outs vsrc:$XT),
                                 (ins vsrc:$XA, vsrc:$XTi, vsrc:$XB),
                         "xxperm $XT, $XA, $XB", IIC_VecPerm, []>,
-                        RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+                        RegConstraint<"$XTi = $XT">;
   def XXPERMR : XX3Form<60, 58, (outs vsrc:$XT),
                                 (ins vsrc:$XA, vsrc:$XTi, vsrc:$XB),
                         "xxpermr $XT, $XA, $XB", IIC_VecPerm, []>,
-                        RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+                        RegConstraint<"$XTi = $XT">;
 
   // Vector Splat Immediate Byte
   def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 996b6efb320d..736ba1edcaea 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -52,12 +52,11 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
   return *this;
 }
 
-PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
-                           const std::string &TuneCPU, const std::string &FS,
-                           const PPCTargetMachine &TM)
-    : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS), TargetTriple(TT),
-      IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
-              TargetTriple.getArch() == Triple::ppc64le),
+PPCSubtarget::PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+                           StringRef FS, const PPCTargetMachine &TM)
+    : PPCGenSubtargetInfo(TT, CPU, TuneCPU, FS),
+      IsPPC64(getTargetTriple().getArch() == Triple::ppc64 ||
+              getTargetTriple().getArch() == Triple::ppc64le),
       TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
       InstrInfo(*this), TLInfo(TM, *this) {
   TSInfo = std::make_unique<PPCSelectionDAGInfo>();
@@ -87,10 +86,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
   // Determine default and user specified characteristics
   std::string CPUName = std::string(CPU);
   if (CPUName.empty() || CPU == "generic") {
-    if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe)
+    if (getTargetTriple().getSubArch() == Triple::PPCSubArch_spe)
       CPUName = "e500";
     else
-      CPUName = std::string(PPC::getNormalizedPPCTargetCPU(TargetTriple));
+      CPUName = std::string(PPC::getNormalizedPPCTargetCPU(getTargetTriple()));
   }
 
   // Determine the CPU to schedule for.
@@ -107,7 +106,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
   if (IsPPC64 && has64BitSupport())
     Use64BitRegs = true;
 
-  if (TargetTriple.isPPC32SecurePlt())
+  if (getTargetTriple().isPPC32SecurePlt())
     IsSecurePlt = true;
 
   if (HasSPE && IsPPC64)
@@ -126,7 +125,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
   IsLittleEndian = TM.isLittleEndian();
 
   if (HasAIXSmallLocalExecTLS || HasAIXSmallLocalDynamicTLS) {
-    if (!TargetTriple.isOSAIX() || !IsPPC64)
+    if (!getTargetTriple().isOSAIX() || !IsPPC64)
       report_fatal_error("The aix-small-local-[exec|dynamic]-tls attribute is "
                          "only supported on AIX in "
                          "64-bit mode.\n",
@@ -143,7 +142,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
                          false);
   }
 
-  if (HasAIXShLibTLSModelOpt && (!TargetTriple.isOSAIX() || !IsPPC64))
+  if (HasAIXShLibTLSModelOpt && (!getTargetTriple().isOSAIX() || !IsPPC64))
     report_fatal_error("The aix-shared-lib-tls-model-opt attribute "
                        "is only supported on AIX in 64-bit mode.\n",
                        false);
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 3c59a475c7eb..c17fca7f70a3 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -78,9 +78,6 @@ public:
   };
 
 protected:
-  /// TargetTriple - What processor and OS we're targeting.
-  Triple TargetTriple;
-
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   Align StackAlignment;
@@ -119,8 +116,7 @@ public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
-  PPCSubtarget(const Triple &TT, const std::string &CPU,
-               const std::string &TuneCPU, const std::string &FS,
+  PPCSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
                const PPCTargetMachine &TM);
 
   ~PPCSubtarget() override;
@@ -210,13 +206,11 @@ public:
 
   POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
 
-  const Triple &getTargetTriple() const { return TargetTriple; }
-
-  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
-  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetELF() const { return getTargetTriple().isOSBinFormatELF(); }
+  bool isTargetMachO() const { return getTargetTriple().isOSBinFormatMachO(); }
+  bool isTargetLinux() const { return getTargetTriple().isOSLinux(); }
 
-  bool isAIXABI() const { return TargetTriple.isOSAIX(); }
+  bool isAIXABI() const { return getTargetTriple().isOSAIX(); }
   bool isSVR4ABI() const { return !isAIXABI(); }
   bool isELFv2ABI() const;
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index b5c6ac111dff..ae92d5eab20c 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -129,7 +129,7 @@ LLVMInitializePowerPCTarget() {
   initializePPCLoopInstrFormPrepPass(PR);
   initializePPCTOCRegDepsPass(PR);
   initializePPCEarlyReturnPass(PR);
-  initializePPCVSXCopyPass(PR);
+  initializePPCVSXWACCCopyPass(PR);
   initializePPCVSXFMAMutatePass(PR);
   initializePPCVSXSwapRemovalPass(PR);
   initializePPCReduceCRLogicalsPass(PR);
@@ -528,7 +528,7 @@ bool PPCPassConfig::addInstSelector() {
     addPass(createPPCCTRLoopsVerify());
 #endif
 
-  addPass(createPPCVSXCopyPass());
+  addPass(createPPCVSXWACCCopyPass());
   return false;
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
deleted file mode 100644
index 794095cd4376..000000000000
--- a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A pass which deals with the complexity of generating legal VSX register
-// copies to/from register classes which partially overlap with the VSX
-// register file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPC.h"
-#include "PPCInstrInfo.h"
-#include "PPCTargetMachine.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ppc-vsx-copy"
-
-namespace {
-  // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
-  // (Altivec and scalar floating-point registers), we need to transform the
-  // copies into subregister copies with other restrictions.
-  struct PPCVSXCopy : public MachineFunctionPass {
-    static char ID;
-    PPCVSXCopy() : MachineFunctionPass(ID) {}
-
-    const TargetInstrInfo *TII;
-
-    bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
-                      MachineRegisterInfo &MRI) {
-      if (Register::isVirtualRegister(Reg)) {
-        return RC->hasSubClassEq(MRI.getRegClass(Reg));
-      } else if (RC->contains(Reg)) {
-        return true;
-      }
-
-      return false;
-    }
-
-    bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
-    }
-
-    bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
-    }
-
-    bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
-    }
-
-    bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
-    }
-
-    bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
-      return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
-    }
-
-protected:
-    bool processBlock(MachineBasicBlock &MBB) {
-      bool Changed = false;
-
-      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      for (MachineInstr &MI : MBB) {
-        if (!MI.isFullCopy())
-          continue;
-
-        MachineOperand &DstMO = MI.getOperand(0);
-        MachineOperand &SrcMO = MI.getOperand(1);
-
-        if ( IsVSReg(DstMO.getReg(), MRI) &&
-            !IsVSReg(SrcMO.getReg(), MRI)) {
-          // This is a copy *to* a VSX register from a non-VSX register.
-          Changed = true;
-
-          const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
-          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
-                  IsVSSReg(SrcMO.getReg(), MRI) ||
-                  IsVSFReg(SrcMO.getReg(), MRI)) &&
-                 "Unknown source for a VSX copy");
-
-          Register NewVReg = MRI.createVirtualRegister(SrcRC);
-          BuildMI(MBB, MI, MI.getDebugLoc(),
-                  TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
-              .addImm(1) // add 1, not 0, because there is no implicit clearing
-                         // of the high bits.
-              .add(SrcMO)
-              .addImm(PPC::sub_64);
-
-          // The source of the original copy is now the new virtual register.
-          SrcMO.setReg(NewVReg);
-        } else if (!IsVSReg(DstMO.getReg(), MRI) &&
-                    IsVSReg(SrcMO.getReg(), MRI)) {
-          // This is a copy *from* a VSX register to a non-VSX register.
-          Changed = true;
-
-          const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
-          assert((IsF8Reg(DstMO.getReg(), MRI) ||
-                  IsVSFReg(DstMO.getReg(), MRI) ||
-                  IsVSSReg(DstMO.getReg(), MRI)) &&
-                 "Unknown destination for a VSX copy");
-
-          // Copy the VSX value into a new VSX register of the correct subclass.
-          Register NewVReg = MRI.createVirtualRegister(DstRC);
-          BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
-                  NewVReg)
-              .add(SrcMO);
-
-          // Transform the original copy into a subregister extraction copy.
-          SrcMO.setReg(NewVReg);
-          SrcMO.setSubReg(PPC::sub_64);
-        }
-      }
-
-      return Changed;
-    }
-
-public:
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      // If we don't have VSX on the subtarget, don't do anything.
-      const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
-      if (!STI.hasVSX())
-        return false;
-      TII = STI.getInstrInfo();
-
-      bool Changed = false;
-
-      for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
-        if (processBlock(B))
-          Changed = true;
-
-      return Changed;
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-  } // end anonymous namespace
-
-INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
-                "PowerPC VSX Copy Legalization", false, false)
-
-char PPCVSXCopy::ID = 0;
-FunctionPass*
-llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
diff --git a/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp
new file mode 100644
index 000000000000..2ec566ddb0b8
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp
@@ -0,0 +1,182 @@
+//===--------- PPCVSXWACCCopy.cpp - VSX and WACC Copy Legalization --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass which deals with the complexity of generating legal VSX register
+// copies to/from register classes which partially overlap with the VSX
+// register file and combines the wacc/wacc_hi copies when needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace {
+// PPCVSXWACCCopy pass - Rewrites full COPYs that cross the VSX register class
+// boundary (VSRC <-> F8RC/VSFRC/VSSRC) into sub_64 subregister copies, and
+// forwards a dmrrc sub_wacc_hi source through a waccrc copy into wacc_hirc.
+struct PPCVSXWACCCopy : public MachineFunctionPass {
+  static char ID;
+  PPCVSXWACCCopy() : MachineFunctionPass(ID) {}
+
+  const TargetInstrInfo *TII; // Assigned in runOnMachineFunction before use.
+
+  bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+                    MachineRegisterInfo &MRI) {
+    if (Register::isVirtualRegister(Reg)) {
+      return RC->hasSubClassEq(MRI.getRegClass(Reg));
+    } else if (RC->contains(Reg)) {
+      return true;
+    }
+
+    return false;
+  }
+
+  bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+  }
+
+  bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+  }
+
+  bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+  }
+
+  bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+  }
+
+  bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+    return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+  }
+
+protected:
+  bool processBlock(MachineBasicBlock &MBB) {
+    bool Changed = false;
+
+    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    for (MachineInstr &MI : MBB) {
+      if (!MI.isFullCopy())
+        continue;
+
+      MachineOperand &DstMO = MI.getOperand(0);
+      MachineOperand &SrcMO = MI.getOperand(1);
+
+      if (IsVSReg(DstMO.getReg(), MRI) && !IsVSReg(SrcMO.getReg(), MRI)) {
+        // This is a copy *to* a VSX register from a non-VSX register.
+        Changed = true;
+
+        const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
+        assert((IsF8Reg(SrcMO.getReg(), MRI) || IsVSSReg(SrcMO.getReg(), MRI) ||
+                IsVSFReg(SrcMO.getReg(), MRI)) &&
+               "Unknown source for a VSX copy");
+
+        Register NewVReg = MRI.createVirtualRegister(SrcRC);
+        BuildMI(MBB, MI, MI.getDebugLoc(),
+                TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+            .addImm(1) // add 1, not 0, because there is no implicit clearing
+                       // of the high bits.
+            .add(SrcMO)
+            .addImm(PPC::sub_64);
+
+        // The source of the original copy is now the new virtual register.
+        SrcMO.setReg(NewVReg);
+      } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+                 IsVSReg(SrcMO.getReg(), MRI)) {
+        // This is a copy *from* a VSX register to a non-VSX register.
+        Changed = true;
+
+        const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
+        assert((IsF8Reg(DstMO.getReg(), MRI) || IsVSFReg(DstMO.getReg(), MRI) ||
+                IsVSSReg(DstMO.getReg(), MRI)) &&
+               "Unknown destination for a VSX copy");
+
+        // Copy the VSX value into a new VSX register of the correct subclass.
+        Register NewVReg = MRI.createVirtualRegister(DstRC);
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+                NewVReg)
+            .add(SrcMO);
+
+        // Transform the original copy into a subregister extraction copy.
+        SrcMO.setReg(NewVReg);
+        SrcMO.setSubReg(PPC::sub_64);
+      } else if (IsRegInClass(DstMO.getReg(), &PPC::WACC_HIRCRegClass, MRI) &&
+                 IsRegInClass(SrcMO.getReg(), &PPC::WACCRCRegClass, MRI)) {
+        // Matches the pattern:
+        //   %a:waccrc = COPY %b.sub_wacc_hi:dmrrc
+        //   %c:wacc_hirc = COPY %a:waccrc
+        // And replaces it with:
+        //   %c:wacc_hirc = COPY %b.sub_wacc_hi:dmrrc
+        MachineInstr *DefMI = MRI.getUniqueVRegDef(SrcMO.getReg());
+        if (!DefMI || !DefMI->isCopy())
+          continue;
+
+        MachineOperand &OrigSrc = DefMI->getOperand(1);
+
+        if (!IsRegInClass(OrigSrc.getReg(), &PPC::DMRRCRegClass, MRI))
+          continue;
+
+        if (OrigSrc.getSubReg() != PPC::sub_wacc_hi)
+          continue;
+
+        // Rewrite the second copy (MI) to read %b.sub_wacc_hi directly.
+        SrcMO.setReg(OrigSrc.getReg());
+        SrcMO.setSubReg(PPC::sub_wacc_hi);
+        Changed = true;
+
+        // Erase the intermediate copy once it has no non-debug uses left.
+        if (MRI.use_nodbg_empty(DefMI->getOperand(0).getReg()))
+          DefMI->eraseFromParent();
+      }
+    }
+
+    return Changed;
+  }
+
+public:
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // If we don't have VSX on the subtarget, don't do anything.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.hasVSX())
+      return false;
+    TII = STI.getInstrInfo();
+
+    bool Changed = false;
+
+    for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
+      if (processBlock(B))
+        Changed = true;
+
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // end anonymous namespace
+
+INITIALIZE_PASS(PPCVSXWACCCopy, DEBUG_TYPE, "PowerPC VSX Copy Legalization",
+                false, false)
+
+char PPCVSXWACCCopy::ID = 0;
+FunctionPass *llvm::createPPCVSXWACCCopyPass() { return new PPCVSXWACCCopy(); }
diff --git a/llvm/lib/Target/PowerPC/README_P9.txt b/llvm/lib/Target/PowerPC/README_P9.txt
index ee1ea735acad..208c8abfdc5f 100644
--- a/llvm/lib/Target/PowerPC/README_P9.txt
+++ b/llvm/lib/Target/PowerPC/README_P9.txt
@@ -224,22 +224,22 @@ VSX:
   . isCommutable = 1
     // xsmaddqp
     [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
-    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    RegConstraint<"$vTi = $vT">,
     AltVSXFMARel;
 
     // xsmsubqp
     [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
-    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    RegConstraint<"$vTi = $vT">,
     AltVSXFMARel;
 
     // xsnmaddqp
     [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
-    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    RegConstraint<"$vTi = $vT">,
     AltVSXFMARel;
 
     // xsnmsubqp
     [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
-    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    RegConstraint<"$vTi = $vT">,
     AltVSXFMARel;
 
 - Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
@@ -276,22 +276,22 @@ VSX:
     . isCommutable = 1
       // xsmaddqpo
       [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
-      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      RegConstraint<"$vTi = $vT">,
       AltVSXFMARel;
 
       // xsmsubqpo
       [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
-      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      RegConstraint<"$vTi = $vT">,
       AltVSXFMARel;
 
       // xsnmaddqpo
       [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
-      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      RegConstraint<"$vTi = $vT">,
       AltVSXFMARel;
 
       // xsnmsubqpo
       [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
-      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      RegConstraint<"$vTi = $vT">,
       AltVSXFMARel;
 
 - QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
@@ -405,7 +405,7 @@ Fixed Point Facility:
 
     But how to map to it??
     [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
-    RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+    RegConstraint<"$XTi = $XT">,
 
   . Or use intrinsic?
     (set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 9ce44d0ff7fd..cd8392849ac4 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -121,7 +121,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
 
   bool parseVTypeToken(const AsmToken &Tok, VTypeState &State, unsigned &Sew,
                        unsigned &Lmul, bool &Fractional, bool &TailAgnostic,
-                       bool &MaskAgnostic);
+                       bool &MaskAgnostic, bool &AltFmt);
   bool generateVTypeError(SMLoc ErrorLoc);
 
   bool generateXSfmmVTypeError(SMLoc ErrorLoc);
@@ -2261,14 +2261,23 @@ ParseStatus RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
 bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State,
                                      unsigned &Sew, unsigned &Lmul,
                                      bool &Fractional, bool &TailAgnostic,
-                                     bool &MaskAgnostic) {
+                                     bool &MaskAgnostic, bool &AltFmt) {
   if (Tok.isNot(AsmToken::Identifier))
     return true;
 
   StringRef Identifier = Tok.getIdentifier();
   if (State < VTypeState::SeenSew && Identifier.consume_front("e")) {
-    if (Identifier.getAsInteger(10, Sew))
-      return true;
+    if (Identifier.getAsInteger(10, Sew)) {
+      if (Identifier == "16alt") {
+        AltFmt = true;
+        Sew = 16;
+      } else if (Identifier == "8alt") {
+        AltFmt = true;
+        Sew = 8;
+      } else {
+        return true;
+      }
+    }
     if (!RISCVVType::isValidSEW(Sew))
       return true;
 
@@ -2340,11 +2349,12 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
   bool Fractional = false;
   bool TailAgnostic = false;
   bool MaskAgnostic = false;
+  bool AltFmt = false;
 
   VTypeState State = VTypeState::SeenNothingYet;
   do {
     if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic,
-                        MaskAgnostic)) {
+                        MaskAgnostic, AltFmt)) {
       // The first time, errors return NoMatch rather than Failure
       if (State == VTypeState::SeenNothingYet)
         return ParseStatus::NoMatch;
@@ -2370,12 +2380,17 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
   }
 
   unsigned VTypeI =
-      RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic);
+      RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic, AltFmt);
   Operands.push_back(RISCVOperand::createVType(VTypeI, S));
   return ParseStatus::Success;
 }
 
 bool RISCVAsmParser::generateVTypeError(SMLoc ErrorLoc) {
+  if (STI->hasFeature(RISCV::FeatureStdExtZvfbfa))
+    return Error(
+        ErrorLoc,
+        "operand must be "
+        "e[8|8alt|16|16alt|32|64],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]");
   return Error(
       ErrorLoc,
       "operand must be "
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 47329b2c2f4d..0ff178e1f195 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -7,7 +7,8 @@ tablegen(LLVM RISCVGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM RISCVGenCompressInstEmitter.inc -gen-compress-inst-emitter)
 tablegen(LLVM RISCVGenMacroFusion.inc -gen-macro-fusion-pred)
 tablegen(LLVM RISCVGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM RISCVGenDisassemblerTables.inc -gen-disassembler
+              --specialize-decoders-per-bitwidth)
 tablegen(LLVM RISCVGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM RISCVGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM RISCVGenMCPseudoLowering.inc -gen-pseudo-lowering)
@@ -87,6 +88,7 @@ add_llvm_target(RISCVCodeGen
   GlobalISel
   IPO
   MC
+  Passes
   RISCVDesc
   RISCVInfo
   Scalar
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index dbb16fce8390..89df9d82f878 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -46,8 +46,6 @@ public:
                               raw_ostream &CStream) const override;
 
 private:
-  void addSPOperands(MCInst &MI) const;
-
   DecodeStatus getInstruction48(MCInst &Instr, uint64_t &Size,
                                 ArrayRef<uint8_t> Bytes, uint64_t Address,
                                 raw_ostream &CStream) const;
@@ -196,6 +194,12 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeSPRegisterClass(MCInst &Inst,
+                                          const MCDisassembler *Decoder) {
+  Inst.addOperand(MCOperand::createReg(RISCV::X2)); // SP operand is always X2
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo,
                                                uint64_t Address,
                                                const MCDisassembler *Decoder) {
@@ -558,7 +562,7 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
   return decodeZcmpRlist(Inst, Imm, Address, Decoder);
 }
 
-static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint16_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder) {
   uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
@@ -600,15 +604,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
 
 #include "RISCVGenDisassemblerTables.inc"
 
-// Add implied SP operand for C.*SP compressed instructions. The SP operand
-// isn't explicitly encoded in the instruction.
-void RISCVDisassembler::addSPOperands(MCInst &MI) const {
-  const MCInstrDesc &MCID = MCII->get(MI.getOpcode());
-  for (unsigned i = 0; i < MCID.getNumOperands(); i++)
-    if (MCID.operands()[i].RegClass == RISCV::SPRegClassID)
-      MI.insert(MI.begin() + i, MCOperand::createReg(RISCV::X2));
-}
-
 namespace {
 
 struct DecoderListEntry {
@@ -656,6 +651,13 @@ static constexpr FeatureBitset XSfSystemGroup = {
     RISCV::FeatureVendorXSiFivecflushdlone,
 };
 
+static constexpr FeatureBitset XMIPSGroup = {
+    RISCV::FeatureVendorXMIPSLSP,
+    RISCV::FeatureVendorXMIPSCMov,
+    RISCV::FeatureVendorXMIPSCBOP,
+    RISCV::FeatureVendorXMIPSEXECTL,
+};
+
 static constexpr FeatureBitset XTHeadGroup = {
     RISCV::FeatureVendorXTHeadBa,      RISCV::FeatureVendorXTHeadBb,
     RISCV::FeatureVendorXTHeadBs,      RISCV::FeatureVendorXTHeadCondMov,
@@ -684,13 +686,7 @@ static constexpr DecoderListEntry DecoderList32[]{
     {DecoderTableXSfvector32, XSfVectorGroup, "SiFive vector extensions"},
     {DecoderTableXSfsystem32, XSfSystemGroup, "SiFive system extensions"},
     {DecoderTableXSfcease32, {RISCV::FeatureVendorXSfcease}, "SiFive sf.cease"},
-    {DecoderTableXmipslsp32, {RISCV::FeatureVendorXMIPSLSP}, "MIPS mips.lsp"},
-    {DecoderTableXmipscmov32,
-     {RISCV::FeatureVendorXMIPSCMov},
-     "MIPS mips.ccmov"},
-    {DecoderTableXmipscbop32,
-     {RISCV::FeatureVendorXMIPSCBOP},
-     "MIPS mips.pref"},
+    {DecoderTableXMIPS32, XMIPSGroup, "Mips extensions"},
     {DecoderTableXAndes32, XAndesGroup, "Andes extensions"},
     {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"},
     // Standard Extensions
@@ -700,6 +696,14 @@ static constexpr DecoderListEntry DecoderList32[]{
     {DecoderTableZdinxRV32Only32, {}, "RV32-only Zdinx (Double in Integer)"},
 };
 
+namespace {
+// Define bitwidths for various types used to instantiate the decoder.
+template <> constexpr uint32_t InsnBitWidth<uint16_t> = 16;
+template <> constexpr uint32_t InsnBitWidth<uint32_t> = 32;
+// Use uint64_t to represent 48 bit instructions.
+template <> constexpr uint32_t InsnBitWidth<uint64_t> = 48;
+} // namespace
+
 DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                                                  ArrayRef<uint8_t> Bytes,
                                                  uint64_t Address,
@@ -710,9 +714,7 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   }
   Size = 4;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read32le(Bytes.data());
+  uint32_t Insn = support::endian::read32le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList32) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -758,9 +760,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   }
   Size = 2;
 
-  // Use uint64_t to match getInstruction48. decodeInstruction is templated
-  // on the Insn type.
-  uint64_t Insn = support::endian::read16le(Bytes.data());
+  uint16_t Insn = support::endian::read16le(Bytes.data());
 
   for (const DecoderListEntry &Entry : DecoderList16) {
     if (!Entry.haveContainedFeatures(STI.getFeatureBits()))
@@ -769,12 +769,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
     LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << " table:\n");
     DecodeStatus Result =
         decodeInstruction(Entry.Table, MI, Insn, Address, this, STI);
-    if (Result == MCDisassembler::Fail)
-      continue;
-
-    addSPOperands(MI);
-
-    return Result;
+    if (Result != MCDisassembler::Fail)
+      return Result;
   }
 
   return MCDisassembler::Fail;
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index 51ea3fc5f677..7df1b7e58000 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -1158,8 +1158,8 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI,
 
   switch (TM.getCodeModel()) {
   default: {
-    reportGISelFailure(const_cast<MachineFunction &>(*MF), *TPC, *MORE,
-                       getName(), "Unsupported code model for lowering", MI);
+    reportGISelFailure(*MF, *TPC, *MORE, getName(),
+                       "Unsupported code model for lowering", MI);
     return false;
   }
   case CodeModel::Small: {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index e88f33d6859e..564657ac65fd 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -26,6 +26,8 @@
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Type.h"
 
 using namespace llvm;
@@ -152,7 +154,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower();
 
   // TODO: Use Vector Single-Width Saturating Instructions for vector types.
-  getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+  getActionDefinitionsBuilder(
+      {G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT, G_SSHLSAT, G_USHLSAT})
       .lower();
 
   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
@@ -485,6 +488,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .minScalar(ST.hasStdExtZbb(), 0, sXLen)
       .lower();
 
+  getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+      .minScalar(ST.hasStdExtZbb(), 0, sXLen)
+      .lower();
+
   getActionDefinitionsBuilder({G_UMAX, G_UMIN, G_SMAX, G_SMIN})
       .legalFor(ST.hasStdExtZbb(), {sXLen})
       .minScalar(ST.hasStdExtZbb(), 0, sXLen)
@@ -692,6 +699,16 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
                     typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
 
+  getActionDefinitionsBuilder(G_ATOMICRMW_ADD)
+      .legalFor(ST.hasStdExtA(), {{sXLen, p0}})
+      .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
+      .clampScalar(0, sXLen, sXLen);
+
+  getActionDefinitionsBuilder(G_ATOMICRMW_SUB)
+      .libcallFor(!ST.hasStdExtA(), {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
+      .clampScalar(0, sXLen, sXLen)
+      .lower();
+
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
@@ -729,6 +746,9 @@ bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::riscv_masked_atomicrmw_add:
+  case Intrinsic::riscv_masked_atomicrmw_sub:
+    return true;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 543c4c5ddfc9..37fe32531800 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -36,6 +36,12 @@ RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
   setFlagsFromFeatures(STI);
 }
 
+RISCVELFStreamer::RISCVELFStreamer(MCContext &C,
+                                   std::unique_ptr<MCAsmBackend> MAB,
+                                   std::unique_ptr<MCObjectWriter> MOW,
+                                   std::unique_ptr<MCCodeEmitter> MCE)
+    : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
+
 RISCVELFStreamer &RISCVTargetELFStreamer::getStreamer() {
   return static_cast<RISCVELFStreamer &>(Streamer);
 }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 98948cd3e949..26da2441d4ae 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -28,8 +28,7 @@ class RISCVELFStreamer : public MCELFStreamer {
 public:
   RISCVELFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> MAB,
                    std::unique_ptr<MCObjectWriter> MOW,
-                   std::unique_ptr<MCCodeEmitter> MCE)
-      : MCELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE)) {}
+                   std::unique_ptr<MCCodeEmitter> MCE);
 
   void changeSection(MCSection *Section, uint32_t Subsection) override;
   void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index b0c27ce6010f..50f5a5d09a69 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -216,9 +216,12 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo,
                                    const MCSubtargetInfo &STI, raw_ostream &O) {
   unsigned Imm = MI->getOperand(OpNo).getImm();
   // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx,
-  // or non-zero in bits 8 and above.
+  // altfmt=1 without zvfbfa extension, or non-zero in bits 9 and above.
   if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED ||
-      RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) {
+      RISCVVType::getSEW(Imm) > 64 ||
+      (RISCVVType::isAltFmt(Imm) &&
+       !STI.hasFeature(RISCV::FeatureStdExtZvfbfa)) ||
+      (Imm >> 9) != 0) {
     O << formatImm(Imm);
     return;
   }
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 83566b1c5778..66ca43604670 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -126,7 +126,7 @@ private:
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr *MI);
   void emitSled(const MachineInstr *MI, SledKind Kind);
 
-  bool lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+  void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
 };
 }
 
@@ -329,12 +329,17 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case TargetOpcode::STATEPOINT:
     return LowerSTATEPOINT(*OutStreamer, SM, *MI);
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
-    // patchable-function-entry is handled in lowerToMCInst
-    // Therefore, we break out of the switch statement if we encounter it here.
     const Function &F = MI->getParent()->getParent()->getFunction();
-    if (F.hasFnAttribute("patchable-function-entry"))
-      break;
-
+    if (F.hasFnAttribute("patchable-function-entry")) {
+      unsigned Num;
+      [[maybe_unused]] bool Result =
+          F.getFnAttribute("patchable-function-entry")
+              .getValueAsString()
+              .getAsInteger(10, Num);
+      assert(!Result && "Enforced by the verifier");
+      emitNops(Num);
+      return;
+    }
     LowerPATCHABLE_FUNCTION_ENTER(MI);
     return;
   }
@@ -347,8 +352,8 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
   }
 
   MCInst OutInst;
-  if (!lowerToMCInst(MI, OutInst))
-    EmitToStreamer(*OutStreamer, OutInst);
+  lowerToMCInst(MI, OutInst);
+  EmitToStreamer(*OutStreamer, OutInst);
 }
 
 bool RISCVAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -1174,9 +1179,9 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
   return true;
 }
 
-bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+void RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
   if (lowerRISCVVMachineInstrToMCInst(MI, OutMI, STI))
-    return false;
+    return;
 
   OutMI.setOpcode(MI->getOpcode());
 
@@ -1185,23 +1190,6 @@ bool RISCVAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
     if (lowerOperand(MO, MCOp))
       OutMI.addOperand(MCOp);
   }
-
-  switch (OutMI.getOpcode()) {
-  case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
-    const Function &F = MI->getParent()->getParent()->getFunction();
-    if (F.hasFnAttribute("patchable-function-entry")) {
-      unsigned Num;
-      if (F.getFnAttribute("patchable-function-entry")
-              .getValueAsString()
-              .getAsInteger(10, Num))
-        return false;
-      emitNops(Num);
-      return true;
-    }
-    break;
-  }
-  }
-  return false;
 }
 
 void RISCVAsmPrinter::emitMachineConstantPoolValue(
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3b738e4cc11a..063963d4ec36 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -680,6 +680,13 @@ def FeatureStdExtV
                      [FeatureStdExtZvl128b, FeatureStdExtZve64d]>,
       RISCVExtensionBitmask<0, 21>;
 
+def FeatureStdExtZvfbfa
+    : RISCVExperimentalExtension<0, 1, "Additional BF16 vector compute support",
+                                 [FeatureStdExtZve32f, FeatureStdExtZfbfmin]>;
+def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">,
+                      AssemblerPredicate<(all_of FeatureStdExtZvfbfa),
+                          "'Zvfbfa' (Additional BF16 vector compute support)">;
+
 def FeatureStdExtZvfbfmin
     : RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>;
 def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">,
@@ -1396,20 +1403,27 @@ def HasVendorXMIPSCMov
       AssemblerPredicate<(all_of FeatureVendorXMIPSCMov),
                          "'Xmipscmov' ('mips.ccmov' instruction)">;
 def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">;
+
 def FeatureVendorXMIPSLSP
     : RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">;
 def HasVendorXMIPSLSP
     : Predicate<"Subtarget->hasVendorXMIPSLSP()">,
       AssemblerPredicate<(all_of FeatureVendorXMIPSLSP),
                          "'Xmipslsp' (load and store pair instructions)">;
-def FeatureVendorXMIPSCBOP
-    : RISCVExtension<1, 0, "MIPS Software Prefetch">;
+
+def FeatureVendorXMIPSCBOP : RISCVExtension<1, 0, "MIPS Software Prefetch">;
 def HasVendorXMIPSCBOP
     : Predicate<"Subtarget->hasVendorXMIPSCBOP()">,
       AssemblerPredicate<(all_of FeatureVendorXMIPSCBOP),
                          "'Xmipscbop' (MIPS hardware prefetch)">;
 def NoVendorXMIPSCBOP : Predicate<"!Subtarget->hasVendorXMIPSCBOP()">;
 
+def FeatureVendorXMIPSEXECTL : RISCVExtension<1, 0, "MIPS execution control">;
+def HasVendorXMIPSEXECTL
+    : Predicate<"Subtarget->hasVendorXMIPSEXECTL()">,
+      AssemblerPredicate<(all_of FeatureVendorXMIPSEXECTL),
+                         "'Xmipsexectl' (MIPS execution control)">;
+
 // WCH / Nanjing Qinheng Microelectronics Extension(s)
 
 def FeatureVendorXwchc
@@ -1668,7 +1682,7 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
                                 "RV32I Base Instruction Set">;
 
 defvar RV32 = DefaultMode;
-def RV64           : HwMode<"+64bit", [IsRV64]>;
+def RV64 : HwMode<[IsRV64]>;
 
 def FeatureRelax
     : SubtargetFeature<"relax", "EnableLinkerRelax", "true",
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 9fc0d815ceee..06ce91771c9e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -106,8 +106,14 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI,
                             const DebugLoc &DL) {
   const auto &STI = MF.getSubtarget<RISCVSubtarget>();
+  // We check Zimop instead of (Zimop || Zcmop) to determine whether HW shadow
+  // stack is available despite the fact that sspush/sspopchk both have a
+  // compressed form, because if only Zcmop is available, we would need to
+  // reserve X5 due to c.sspopchk only takes X5 and we currently do not support
+  // using X5 as the return address register.
+  // However, we can still aggressively use c.sspush x1 if zcmop is available.
   bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") &&
-                          STI.hasStdExtZicfiss();
+                          STI.hasStdExtZimop();
   bool HasSWShadowStack =
       MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
   if (!HasHWShadowStack && !HasSWShadowStack)
@@ -124,7 +130,12 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
 
   const RISCVInstrInfo *TII = STI.getInstrInfo();
   if (HasHWShadowStack) {
-    BuildMI(MBB, MI, DL, TII->get(RISCV::SSPUSH)).addReg(RAReg);
+    if (STI.hasStdExtZcmop()) {
+      static_assert(RAReg == RISCV::X1, "C.SSPUSH only accepts X1");
+      BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_C_SSPUSH));
+    } else {
+      BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPUSH)).addReg(RAReg);
+    }
     return;
   }
 
@@ -172,7 +183,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
                             const DebugLoc &DL) {
   const auto &STI = MF.getSubtarget<RISCVSubtarget>();
   bool HasHWShadowStack = MF.getFunction().hasFnAttribute("hw-shadow-stack") &&
-                          STI.hasStdExtZicfiss();
+                          STI.hasStdExtZimop();
   bool HasSWShadowStack =
       MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
   if (!HasHWShadowStack && !HasSWShadowStack)
@@ -186,7 +197,7 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
 
   const RISCVInstrInfo *TII = STI.getInstrInfo();
   if (HasHWShadowStack) {
-    BuildMI(MBB, MI, DL, TII->get(RISCV::SSPOPCHK)).addReg(RAReg);
+    BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoMOP_SSPOPCHK)).addReg(RAReg);
     return;
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 80a48c5ec11f..52dc53e4545e 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -561,7 +561,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
     EVL = Builder.CreateElementCount(
         Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount());
 
-  CallInst *Call;
+  Value *Call;
 
   if (!StoreVal) {
     Call = Builder.CreateIntrinsic(
@@ -571,8 +571,7 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
 
     // Merge llvm.masked.gather's passthru
     if (II->getIntrinsicID() == Intrinsic::masked_gather)
-      Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType},
-                                     {Mask, Call, II->getArgOperand(3), EVL});
+      Call = Builder.CreateSelect(Mask, Call, II->getArgOperand(3));
   } else
     Call = Builder.CreateIntrinsic(
         Intrinsic::experimental_vp_strided_store,
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f9f35f66319b..c7f15415ebb9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -819,49 +819,6 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
   return false;
 }
 
-// (xor X, (and (xor X, C1), C2))
-// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt)
-// where C2 is a shifted mask with width=Width and shift=ShAmt
-bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) {
-
-  if (!Subtarget->hasVendorXqcibm())
-    return false;
-
-  using namespace SDPatternMatch;
-
-  SDValue X;
-  APInt CImm, CMask;
-  if (!sd_match(
-          Node,
-          m_Xor(m_Value(X),
-                m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))),
-                               m_ConstInt(CMask))))))
-    return false;
-
-  unsigned Width, ShAmt;
-  if (!CMask.isShiftedMask(ShAmt, Width))
-    return false;
-
-  int64_t Imm = CImm.getSExtValue();
-  Imm >>= ShAmt;
-
-  SDLoc DL(Node);
-  SDValue ImmNode;
-  auto Opc = RISCV::QC_INSB;
-
-  if (isInt<5>(Imm)) {
-    Opc = RISCV::QC_INSBI;
-    ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32);
-  } else {
-    ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget);
-  }
-  SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32),
-                   CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)};
-  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops));
-
-  return true;
-}
-
 bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
                                                    const SDLoc &DL, MVT VT,
                                                    SDValue X, unsigned Msb,
@@ -1095,7 +1052,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
   SDLoc DL(Node);
   MVT VT = Node->getSimpleValueType(0);
 
-  bool HasBitTest = Subtarget->hasStdExtZbs() || Subtarget->hasVendorXTHeadBs();
+  bool HasBitTest = Subtarget->hasBEXTILike();
 
   switch (Opcode) {
   case ISD::Constant: {
@@ -1442,9 +1399,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (tryShrinkShlLogicImm(Node))
       return;
 
-    if (tryBitfieldInsertOpFromXor(Node))
-      return;
-
     break;
   case ISD::AND: {
     auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -2951,6 +2905,65 @@ static bool isWorthFoldingAdd(SDValue Add) {
   return true;
 }
 
+// Returns true if User is a load, or a store whose stored value is not Add.
+static bool isRegImmLoadOrStore(SDNode *User, SDValue Add) {
+  switch (User->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD:
+  case RISCVISD::LD_RV32:
+  case ISD::ATOMIC_LOAD:
+    break;
+  case ISD::STORE:
+    // Don't allow stores of Add. It must only be used as the address.
+    if (cast<StoreSDNode>(User)->getValue() == Add)
+      return false;
+    break;
+  case RISCVISD::SD_RV32:
+    // Don't allow stores of Add. It must only be used as the address.
+    if (User->getOperand(0) == Add || User->getOperand(1) == Add)
+      return false;
+    break;
+  case ISD::ATOMIC_STORE:
+    // Don't allow stores of Add. It must only be used as the address.
+    if (cast<AtomicSDNode>(User)->getVal() == Add)
+      return false;
+    break;
+  }
+  return true;
+}
+
+// Guard for SelectAddrRegImm: folding must not conflict with PseudoMovAddr
+// fusion, so require every user of the ADD_LO node Addr to be a load/store
+// or an add of a simm12 offset below Alignment that feeds only loads/stores.
+bool RISCVDAGToDAGISel::areOffsetsWithinAlignment(SDValue Addr,
+                                                  Align Alignment) {
+  assert(Addr->getOpcode() == RISCVISD::ADD_LO);
+  for (auto *User : Addr->users()) {
+    // A load or store that uses Addr directly has an offset of 0, which is
+    // always within the alignment.
+    if (isRegImmLoadOrStore(User, Addr))
+      continue;
+
+    if (CurDAG->isBaseWithConstantOffset(SDValue(User, 0))) {
+      int64_t CVal = cast<ConstantSDNode>(User->getOperand(1))->getSExtValue();
+      if (!isInt<12>(CVal) || Alignment <= CVal)
+        return false;
+
+      // Every user of this add must itself be a foldable load/store.
+      for (auto *AddUser : User->users())
+        if (!isRegImmLoadOrStore(AddUser, SDValue(User, 0)))
+          return false;
+
+      continue;
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
 bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
   if (SelectAddrFrameIndex(Addr, Base, Offset))
@@ -2960,9 +2973,21 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
   MVT VT = Addr.getSimpleValueType();
 
   if (Addr.getOpcode() == RISCVISD::ADD_LO) {
-    Base = Addr.getOperand(0);
-    Offset = Addr.getOperand(1);
-    return true;
+    bool CanFold = true;
+    // Unconditionally fold if operand 1 is not a global address (e.g.
+    // externsymbol)
+    if (auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(1))) {
+      const DataLayout &DL = CurDAG->getDataLayout();
+      Align Alignment = commonAlignment(
+          GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
+      if (!areOffsetsWithinAlignment(Addr, Alignment))
+        CanFold = false;
+    }
+    if (CanFold) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1);
+      return true;
+    }
   }
 
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -2980,7 +3005,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
           const DataLayout &DL = CurDAG->getDataLayout();
           Align Alignment = commonAlignment(
               GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
-          if ((CVal == 0 || Alignment > CVal)) {
+          if ((CVal == 0 || Alignment > CVal) &&
+              areOffsetsWithinAlignment(Base, Alignment)) {
             int64_t CombinedOffset = CVal + GA->getOffset();
             Base = Base.getOperand(0);
             Offset = CurDAG->getTargetGlobalAddress(
@@ -3983,6 +4009,15 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,
       if (Use.getOperandNo() == 0 && Bits >= 32)
         break;
       return false;
+    case RISCV::TH_EXT:
+    case RISCV::TH_EXTU: {
+      unsigned Msb = User->getConstantOperandVal(1);
+      unsigned Lsb = User->getConstantOperandVal(2);
+      // Behavior of Msb < Lsb is not well documented.
+      if (Msb >= Lsb && Bits > Msb)
+        break;
+      return false;
+    }
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index c329a4c6ec62..cf2f763abc06 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -45,6 +45,8 @@ public:
                                     InlineAsm::ConstraintCode ConstraintID,
                                     std::vector<SDValue> &OutOps) override;
 
+  bool areOffsetsWithinAlignment(SDValue Addr, Align Alignment);
+
   bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -75,7 +77,6 @@ public:
   bool trySignedBitfieldExtract(SDNode *Node);
   bool trySignedBitfieldInsertInSign(SDNode *Node);
   bool trySignedBitfieldInsertInMask(SDNode *Node);
-  bool tryBitfieldInsertOpFromXor(SDNode *Node);
   bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node);
   bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
                                   SDValue X, unsigned Msb, unsigned Lsb);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a33224845e2b..a68a3c14dc41 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2173,7 +2173,7 @@ bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial(
   // on the basis that it's possible the sinking+duplication of the AND in
   // CodeGenPrepare triggered by this hook wouldn't decrease the instruction
   // count and would increase code size (e.g. ANDI+BNEZ => BEXTI+BNEZ).
-  if (!Subtarget.hasStdExtZbs() && !Subtarget.hasVendorXTHeadBs())
+  if (!Subtarget.hasBEXTILike())
     return false;
   ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
   if (!Mask)
@@ -3744,9 +3744,11 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
   // different
   // FIXME: Support i1 vectors, maybe by promoting to i8?
   MVT EltTy = VT.getVectorElementType();
+  if (EltTy == MVT::i1 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
+    return SDValue();
   MVT SrcVT = Src.getSimpleValueType();
-  if (EltTy == MVT::i1 || EltTy != SrcVT.getVectorElementType() ||
-      !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+  if (EltTy != SrcVT.getVectorElementType())
     return SDValue();
   SDValue Idx = SplatVal.getOperand(1);
   // The index must be a legal type.
@@ -4518,41 +4520,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
 
+  // General case: splat the first operand and slide other operands down one
+  // by one to form a vector. Alternatively, if every operand is an
+  // extraction from element 0 of a vector, we use that vector from the last
+  // extraction as the start value and slide up instead of slide down. That way
+  // (1) we can avoid the initial splat and (2) we can turn those vslide1up into
+  // vslideup of 1 later and eliminate the vector to scalar movement, which is
+  // something we cannot do with vslide1down/vslidedown.
+  // Of course, using vslide1up/vslideup might increase the register pressure,
+  // and that's why we conservatively limit to cases where every operand is an
+  // extraction from the first element.
+  SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+  SDValue EVec;
+  bool SlideUp = false;
+  auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
+                       SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
+    if (SlideUp)
+      return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+    return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+  };
+
+  // The reason we don't use all_of here is because we're also capturing EVec
+  // from the last non-undef operand. If the std::execution_policy of the
+  // underlying std::all_of is anything but std::sequenced_policy we might
+  // capture the wrong EVec.
+  for (SDValue V : Operands) {
+    using namespace SDPatternMatch;
+    SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
+    if (!SlideUp)
+      break;
+  }
+
+  if (SlideUp) {
+    MVT EVecContainerVT = EVec.getSimpleValueType();
+    // Make sure the original vector has scalable vector type.
+    if (EVecContainerVT.isFixedLengthVector()) {
+      EVecContainerVT =
+          getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
+      EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
+    }
+
+    // Adapt EVec's type into ContainerVT.
+    if (EVecContainerVT.getVectorMinNumElements() <
+        ContainerVT.getVectorMinNumElements())
+      EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
+    else
+      EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);
+
+    // Reverse the elements as we're going to slide up from the last element.
+    std::reverse(Operands.begin(), Operands.end());
+  }
+
   SDValue Vec;
   UndefCount = 0;
-  for (SDValue V : Op->ops()) {
+  for (SDValue V : Operands) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
 
-    // Start our sequence with a TA splat in the hopes that hardware is able to
-    // recognize there's no dependency on the prior value of our temporary
-    // register.
+    // Start our sequence with either a TA splat or extract source in the
+    // hopes that hardware is able to recognize there's no dependency on the
+    // prior value of our temporary register.
     if (!Vec) {
-      Vec = DAG.getSplatVector(VT, DL, V);
-      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      if (SlideUp) {
+        Vec = EVec;
+      } else {
+        Vec = DAG.getSplatVector(VT, DL, V);
+        Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      }
+
       UndefCount = 0;
       continue;
     }
 
     if (UndefCount) {
       const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-      Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                          Vec, Offset, Mask, VL, Policy);
+      Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                      VL);
       UndefCount = 0;
     }
-    auto OpCode =
-      VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+    unsigned Opcode;
+    if (VT.isFloatingPoint())
+      Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+    else
+      Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
     if (!VT.isFloatingPoint())
       V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
-    Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+    Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
                       V, Mask, VL);
   }
   if (UndefCount) {
     const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-    Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                        Vec, Offset, Mask, VL, Policy);
+    Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                    VL);
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
@@ -8193,6 +8258,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
               DL, VT, LHS, DAG.getSignedConstant(Imm + 1, DL, OpVT), CCVal);
           return DAG.getLogicalNOT(DL, SetCC, VT);
         }
+        // Lower (setugt X, 2047) as (setne (srl X, 11), 0).
+        if (CCVal == ISD::SETUGT && Imm == 2047) {
+          SDValue Shift = DAG.getNode(ISD::SRL, DL, OpVT, LHS,
+                                      DAG.getShiftAmountConstant(11, OpVT, DL));
+          return DAG.getSetCC(DL, VT, Shift, DAG.getConstant(0, DL, OpVT),
+                              ISD::SETNE);
+        }
       }
 
       // Not a constant we could handle, swap the operands and condition code to
@@ -8815,7 +8887,15 @@ SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
     reportFatalUsageError("Unsupported code model for lowering");
   case CodeModel::Small: {
     // Generate a sequence for accessing addresses within the first 2 GiB of
-    // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
+    // address space.
+    if (Subtarget.hasVendorXqcili()) {
+      // Use QC.E.LI to generate the address, as this is easier to relax than
+      // LUI/ADDI.
+      SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
+      return DAG.getNode(RISCVISD::QC_E_LI, DL, Ty, Addr);
+    }
+
+    // This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
     SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
     SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
     SDValue MNHi = DAG.getNode(RISCVISD::HI, DL, Ty, AddrHi);
@@ -9036,8 +9116,12 @@ static std::optional<bool> matchSetCC(SDValue LHS, SDValue RHS,
   return std::nullopt;
 }
 
-static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,
-                                    const RISCVSubtarget &Subtarget) {
+static bool isSimm12Constant(SDValue V) {
+  return isa<ConstantSDNode>(V) && V->getAsAPIntVal().isSignedIntN(12);
+}
+
+static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG,
+                                  const RISCVSubtarget &Subtarget) {
   SDValue CondV = N->getOperand(0);
   SDValue TrueV = N->getOperand(1);
   SDValue FalseV = N->getOperand(2);
@@ -9057,14 +9141,16 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(ISD::OR, DL, VT, Neg, DAG.getFreeze(TrueV));
     }
 
+    const bool HasCZero = VT.isScalarInteger() && Subtarget.hasCZEROLike();
+
     // (select c, 0, y) -> (c-1) & y
-    if (isNullConstant(TrueV)) {
-      SDValue Neg = DAG.getNode(ISD::ADD, DL, VT, CondV,
-                                DAG.getAllOnesConstant(DL, VT));
+    if (isNullConstant(TrueV) && (!HasCZero || isSimm12Constant(FalseV))) {
+      SDValue Neg =
+          DAG.getNode(ISD::ADD, DL, VT, CondV, DAG.getAllOnesConstant(DL, VT));
       return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(FalseV));
     }
     // (select c, y, 0) -> -c & y
-    if (isNullConstant(FalseV)) {
+    if (isNullConstant(FalseV) && (!HasCZero || isSimm12Constant(TrueV))) {
       SDValue Neg = DAG.getNegative(CondV, DL, VT);
       return DAG.getNode(ISD::AND, DL, VT, Neg, DAG.getFreeze(TrueV));
     }
@@ -9185,12 +9271,16 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::VSELECT, DL, VT, CondSplat, TrueV, FalseV);
   }
 
+  // Try some other optimizations before falling back to generic lowering.
+  if (SDValue V = lowerSelectToBinOp(Op.getNode(), DAG, Subtarget))
+    return V;
+
   // When Zicond or XVentanaCondOps is present, emit CZERO_EQZ and CZERO_NEZ
   // nodes to implement the SELECT. Performing the lowering here allows for
   // greater control over when CZERO_{EQZ/NEZ} are used vs another branchless
   // sequence or RISCVISD::SELECT_CC node (branch-based select).
-  if ((Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps()) &&
-      VT.isScalarInteger()) {
+  if (Subtarget.hasCZEROLike() && VT.isScalarInteger()) {
+
     // (select c, t, 0) -> (czero_eqz t, c)
     if (isNullConstant(FalseV))
       return DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV);
@@ -9244,10 +9334,6 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
           DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV));
     }
 
-    // Try some other optimizations before falling back to generic lowering.
-    if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
-      return V;
-
     // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1)
     // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2)
     if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV)) {
@@ -9280,19 +9366,38 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
         }
       }
 
-      const int TrueValCost = RISCVMatInt::getIntMatCost(
-          TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
-      const int FalseValCost = RISCVMatInt::getIntMatCost(
-          FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
-      bool IsCZERO_NEZ = TrueValCost <= FalseValCost;
+      // Use SHL/ADDI (and possibly XORI) to avoid having to materialize
+      // a constant in a register.
+      if ((TrueVal - FalseVal).isPowerOf2() && FalseVal.isSignedIntN(12)) {
+        SDValue Log2 = DAG.getConstant((TrueVal - FalseVal).logBase2(), DL, VT);
+        SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2);
+        return DAG.getNode(ISD::ADD, DL, VT, FalseV, BitDiff);
+      }
+      if ((FalseVal - TrueVal).isPowerOf2() && TrueVal.isSignedIntN(12)) {
+        SDValue Log2 = DAG.getConstant((FalseVal - TrueVal).logBase2(), DL, VT);
+        CondV = DAG.getLogicalNOT(DL, CondV, CondV->getValueType(0));
+        SDValue BitDiff = DAG.getNode(ISD::SHL, DL, VT, CondV, Log2);
+        return DAG.getNode(ISD::ADD, DL, VT, TrueV, BitDiff);
+      }
+
+      auto getCost = [&](const APInt &Delta, const APInt &Addend) {
+        const int DeltaCost = RISCVMatInt::getIntMatCost(
+            Delta, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+        // Does the addend fold into an ADDI?
+        if (Addend.isSignedIntN(12))
+          return DeltaCost;
+        const int AddendCost = RISCVMatInt::getIntMatCost(
+            Addend, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true);
+        return AddendCost + DeltaCost;
+      };
+      bool IsCZERO_NEZ = getCost(FalseVal - TrueVal, TrueVal) <=
+                         getCost(TrueVal - FalseVal, FalseVal);
       SDValue LHSVal = DAG.getConstant(
           IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT);
-      SDValue RHSVal =
-          DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT);
       SDValue CMOV =
           DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
                       DL, VT, LHSVal, CondV);
-      return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
+      return DAG.getNode(ISD::ADD, DL, VT, CMOV, IsCZERO_NEZ ? TrueV : FalseV);
     }
 
     // (select c, c1, t) -> (add (czero_nez t - c1, c), c1)
@@ -9327,12 +9432,10 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       return DAG.getNode(
           ISD::OR, DL, VT,
           DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV),
-          DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV));
+          DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV, CondV),
+          SDNodeFlags::Disjoint);
   }
 
-  if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget))
-    return V;
-
   if (Op.hasOneUse()) {
     unsigned UseOpc = Op->user_begin()->getOpcode();
     if (isBinOp(UseOpc) && DAG.isSafeToSpeculativelyExecute(UseOpc)) {
@@ -10738,11 +10841,11 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1));
   }
   case Intrinsic::riscv_mopr:
-    return DAG.getNode(RISCVISD::MOPR, DL, XLenVT, Op.getOperand(1),
+    return DAG.getNode(RISCVISD::MOP_R, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2));
 
   case Intrinsic::riscv_moprr: {
-    return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1),
+    return DAG.getNode(RISCVISD::MOP_RR, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
   case Intrinsic::riscv_clmul:
@@ -14877,7 +14980,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       SDValue NewOp =
           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
       SDValue Res = DAG.getNode(
-          RISCVISD::MOPR, DL, MVT::i64, NewOp,
+          RISCVISD::MOP_R, DL, MVT::i64, NewOp,
           DAG.getTargetConstant(N->getConstantOperandVal(2), DL, MVT::i64));
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
@@ -14890,7 +14993,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       SDValue NewOp1 =
           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
       SDValue Res = DAG.getNode(
-          RISCVISD::MOPRR, DL, MVT::i64, NewOp0, NewOp1,
+          RISCVISD::MOP_RR, DL, MVT::i64, NewOp0, NewOp1,
           DAG.getTargetConstant(N->getConstantOperandVal(3), DL, MVT::i64));
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
@@ -15381,9 +15484,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
 
   if (!Subtarget.hasConditionalMoveFusion()) {
     // (select cond, x, (and x, c)) has custom lowering with Zicond.
-    if ((!Subtarget.hasStdExtZicond() &&
-         !Subtarget.hasVendorXVentanaCondOps()) ||
-        N->getOpcode() != ISD::AND)
+    if (!Subtarget.hasCZEROLike() || N->getOpcode() != ISD::AND)
       return SDValue();
 
     // Maybe harmful when condition code has multiple use.
@@ -16059,12 +16160,55 @@ static SDValue combineOrOfCZERO(SDNode *N, SDValue N0, SDValue N1,
 
   SDValue NewN0 = DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV.getOperand(0),
                               Cond);
-  SDValue NewN1 = DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0),
-                              Cond);
-  SDValue NewOr = DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1);
+  SDValue NewN1 =
+      DAG.getNode(RISCVISD::CZERO_NEZ, DL, VT, FalseV.getOperand(0), Cond);
+  SDValue NewOr =
+      DAG.getNode(ISD::OR, DL, VT, NewN0, NewN1, SDNodeFlags::Disjoint);
   return DAG.getNode(ISD::XOR, DL, VT, NewOr, TrueV.getOperand(1));
 }
 
+// (xor X, (xor (and X, C2), Y))
+// ->(qc_insb X, (sra Y, ShAmt), Width, ShAmt)
+// where C2 is a shifted mask with width = Width and shift = ShAmt
+// qc_insb might become qc.insb or qc.insbi depending on the operands.
+static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG,
+                                          const RISCVSubtarget &Subtarget) {
+  if (!Subtarget.hasVendorXqcibm())
+    return SDValue();
+
+  using namespace SDPatternMatch;
+
+  SDValue Base, Inserted;
+  APInt CMask;
+  if (!sd_match(N, m_Xor(m_Value(Base),
+                         m_OneUse(m_Xor(m_OneUse(m_And(m_Deferred(Base),
+                                                       m_ConstInt(CMask))),
+                                        m_Value(Inserted))))))
+    return SDValue();
+
+  if (N->getValueType(0) != MVT::i32)
+    return SDValue();
+
+  unsigned Width, ShAmt;
+  if (!CMask.isShiftedMask(ShAmt, Width))
+    return SDValue();
+
+  // Check if all zero bits in CMask are also zero in Inserted
+  if (!DAG.MaskedValueIsZero(Inserted, ~CMask))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // `Inserted` needs to be right shifted before it is put into the
+  // instruction.
+  Inserted = DAG.getNode(ISD::SRA, DL, MVT::i32, Inserted,
+                         DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
+
+  SDValue Ops[] = {Base, Inserted, DAG.getConstant(Width, DL, MVT::i32),
+                   DAG.getConstant(ShAmt, DL, MVT::i32)};
+  return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops);
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const RISCVSubtarget &Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
@@ -16108,8 +16252,8 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N0.getOperand(0));
     SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N0.getOperand(1));
     SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i64, Op0, Op1);
-    SDValue And = DAG.getNOT(DL, Shl, MVT::i64);
-    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And);
+    SDValue Not = DAG.getNOT(DL, Shl, MVT::i64);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Not);
   }
 
   // fold (xor (sllw 1, x), -1) -> (rolw ~1, x)
@@ -16137,6 +16281,9 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue V = combineXorToBitfieldInsert(N, DAG, Subtarget))
+    return V;
+
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
   if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
@@ -16590,10 +16737,6 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
                       DAG.getConstant(0, DL, XLenVT), CC);
 }
 
-// Replace (seteq (i64 (and X, 0xffffffff)), C1) with
-// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
-// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
-// can become a sext.w instead of a shift pair.
 static SDValue performSETCCCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const RISCVSubtarget &Subtarget) {
@@ -16613,20 +16756,44 @@ static SDValue performSETCCCombine(SDNode *N,
           combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget))
     return V;
 
-  // (X & -4096) == 0 -> (X >> 12) == 0 if the AND constant can't use ANDI.
-  if (DCI.isAfterLegalizeDAG() && isNullConstant(N1) &&
+  if (DCI.isAfterLegalizeDAG() && isa<ConstantSDNode>(N1) &&
       N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
       isa<ConstantSDNode>(N0.getOperand(1))) {
-    const APInt &AndRHSC =
-        cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
-    if (!isInt<12>(AndRHSC.getSExtValue()) && AndRHSC.isNegatedPowerOf2()) {
+    const APInt &AndRHSC = N0.getConstantOperandAPInt(1);
+    // (X & -(1 << C)) == 0 -> (X >> C) == 0 if the AND constant can't use ANDI.
+    if (isNullConstant(N1) && !isInt<12>(AndRHSC.getSExtValue()) &&
+        AndRHSC.isNegatedPowerOf2()) {
       unsigned ShiftBits = AndRHSC.countr_zero();
-      SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, N0.getOperand(0),
-                                  DAG.getConstant(ShiftBits, dl, VT));
+      SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, N0.getOperand(0),
+                                  DAG.getConstant(ShiftBits, dl, OpVT));
       return DAG.getSetCC(dl, VT, Shift, N1, Cond);
     }
+
+    // Similar to above but handling the lower 32 bits by using sraiw. Allow
+    // comparing with constants other than 0 if the constant can be folded into
+    // addi or xori after shifting.
+    uint64_t N1Int = cast<ConstantSDNode>(N1)->getZExtValue();
+    uint64_t AndRHSInt = AndRHSC.getZExtValue();
+    if (OpVT == MVT::i64 && AndRHSInt <= 0xffffffff &&
+        isPowerOf2_32(-uint32_t(AndRHSInt)) && (N1Int & AndRHSInt) == N1Int) {
+      unsigned ShiftBits = llvm::countr_zero(AndRHSInt);
+      int64_t NewC = SignExtend64<32>(N1Int) >> ShiftBits;
+      if (NewC >= -2048 && NewC <= 2048) {
+        SDValue SExt =
+            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OpVT, N0.getOperand(0),
+                        DAG.getValueType(MVT::i32));
+        SDValue Shift = DAG.getNode(ISD::SRA, dl, OpVT, SExt,
+                                    DAG.getConstant(ShiftBits, dl, OpVT));
+        return DAG.getSetCC(dl, VT, Shift,
+                            DAG.getSignedConstant(NewC, dl, OpVT), Cond);
+      }
+    }
   }
 
+  // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
+  // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
+  // bit 31. Same for setne. C1' may be cheaper to materialize and the
+  // sext_inreg can become a sext.w instead of a shift pair.
   if (OpVT != MVT::i64 || !Subtarget.is64Bit())
     return SDValue();
 
@@ -18674,7 +18841,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG,
     break;
   }
 
-  if (!TrueVal.hasOneUse() || isa<ConstantSDNode>(FalseVal))
+  if (!TrueVal.hasOneUse())
     return SDValue();
 
   unsigned OpToFold;
@@ -18746,6 +18913,10 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) {
   if (Cond->getOperand(0) != CountZeroesArgument)
     return SDValue();
 
+  unsigned BitWidth = CountZeroes.getValueSizeInBits();
+  if (!isPowerOf2_32(BitWidth))
+    return SDValue();
+
   if (CountZeroes.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
     CountZeroes = DAG.getNode(ISD::CTTZ, SDLoc(CountZeroes),
                               CountZeroes.getValueType(), CountZeroesArgument);
@@ -18754,7 +18925,6 @@ static SDValue foldSelectOfCTTZOrCTLZ(SDNode *N, SelectionDAG &DAG) {
                               CountZeroes.getValueType(), CountZeroesArgument);
   }
 
-  unsigned BitWidth = CountZeroes.getValueSizeInBits();
   SDValue BitWidthMinusOne =
       DAG.getConstant(BitWidth - 1, SDLoc(N), CountZeroes.getValueType());
 
@@ -18778,7 +18948,7 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG,
   // Replace (setcc eq (and x, C)) with (setcc ne (and x, C))) to generate
   // BEXTI, where C is power of 2.
   if (Subtarget.hasStdExtZbs() && VT.isScalarInteger() &&
-      (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps())) {
+      (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())) {
     SDValue LHS = Cond.getOperand(0);
     SDValue RHS = Cond.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -18953,6 +19123,7 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
                                          SelectionDAG &DAG,
                                          const RISCVSubtarget &Subtarget,
                                          const RISCVTargetLowering &TLI) {
+  using namespace SDPatternMatch;
   // Note: We intentionally do not check the legality of the reduction type.
   // We want to handle the m4/m8 *src*  types, and thus need to let illegal
   // intermediate types flow through here.
@@ -18960,11 +19131,10 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
       !InVec.getValueType().getVectorElementCount().isKnownMultipleOf(4))
     return SDValue();
 
-  // Recurse through adds (since generic dag canonicalizes to that
-  // form). TODO: Handle disjoint or here.
-  if (InVec->getOpcode() == ISD::ADD) {
-    SDValue A = InVec.getOperand(0);
-    SDValue B = InVec.getOperand(1);
+  // Recurse through adds/disjoint ors (since generic dag canonicalizes to that
+  // form).
+  SDValue A, B;
+  if (sd_match(InVec, m_AddLike(m_Value(A), m_Value(B)))) {
     SDValue AOpt = foldReduceOperandViaVQDOT(A, DL, DAG, Subtarget, TLI);
     SDValue BOpt = foldReduceOperandViaVQDOT(B, DL, DAG, Subtarget, TLI);
     if (AOpt || BOpt) {
@@ -19001,12 +19171,9 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
   // mul (zext a, zext b) -> partial_reduce_umla 0, a, b
   // mul (sext a, zext b) -> partial_reduce_ssmla 0, a, b
   // mul (zext a, sext b) -> partial_reduce_smla 0, b, a (swapped)
-  if (InVec.getOpcode() != ISD::MUL)
+  if (!sd_match(InVec, m_Mul(m_Value(A), m_Value(B))))
     return SDValue();
 
-  SDValue A = InVec.getOperand(0);
-  SDValue B = InVec.getOperand(1);
-
   if (!ISD::isExtOpcode(A.getOpcode()))
     return SDValue();
 
@@ -20081,6 +20248,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::FMUL: {
+    using namespace SDPatternMatch;
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    SDValue X, Y;
+    // InstCombine canonicalizes fneg (fmul x, y) -> fmul x, (fneg y), see
+    // hoistFNegAboveFMulFDiv.
+    // Undo this and sink the fneg so we match more fmsub/fnmadd patterns.
+    if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
+      return DAG.getNode(ISD::FNEG, DL, VT,
+                         DAG.getNode(ISD::FMUL, DL, VT, X, Y));
+
     // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
@@ -20091,13 +20269,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
     if (!C || !C->getValueAPF().isExactlyValue(+1.0))
       return SDValue();
-    EVT VT = N->getValueType(0);
     if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
       return SDValue();
     SDValue Sign = N0->getOperand(1);
     if (Sign.getValueType() != VT)
       return SDValue();
-    return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1));
+    return DAG.getNode(RISCVISD::FSGNJX, DL, VT, N1, N0->getOperand(1));
   }
   case ISD::FADD:
   case ISD::UMAX:
@@ -20381,9 +20558,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
             VT, DL, MGN->getChain(), BasePtr,
             DAG.getSignedConstant(StepNumerator, DL, XLenVT), MGN->getMask(),
             EVL, MGN->getMemOperand());
-        SDValue VPSelect = DAG.getNode(ISD::VP_SELECT, DL, VT, MGN->getMask(),
-                                       StridedLoad, MGN->getPassThru(), EVL);
-        return DAG.getMergeValues({VPSelect, SDValue(StridedLoad.getNode(), 1)},
+        SDValue Select = DAG.getSelect(DL, VT, MGN->getMask(), StridedLoad,
+                                       MGN->getPassThru());
+        return DAG.getMergeValues({Select, SDValue(StridedLoad.getNode(), 1)},
                                   DL);
       }
     }
@@ -21060,6 +21237,38 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return N->getOperand(0);
     break;
   }
+  case RISCVISD::VSLIDE1UP_VL:
+  case RISCVISD::VFSLIDE1UP_VL: {
+    using namespace SDPatternMatch;
+    SDValue SrcVec;
+    SDLoc DL(N);
+    MVT VT = N->getSimpleValueType(0);
+    // If the scalar we're sliding in was extracted from the first element of a
+    // vector, we can use that vector as the passthru in a normal slideup of 1.
+    // This saves us an extract_element instruction (i.e. vfmv.f.s, vmv.x.s).
+    if (!N->getOperand(0).isUndef() ||
+        !sd_match(N->getOperand(2),
+                  m_AnyOf(m_ExtractElt(m_Value(SrcVec), m_Zero()),
+                          m_Node(RISCVISD::VMV_X_S, m_Value(SrcVec)))))
+      break;
+
+    MVT SrcVecVT = SrcVec.getSimpleValueType();
+    if (SrcVecVT.getVectorElementType() != VT.getVectorElementType())
+      break;
+    // Adapt the value type of source vector.
+    if (SrcVecVT.isFixedLengthVector()) {
+      SrcVecVT = getContainerForFixedLengthVector(SrcVecVT);
+      SrcVec = convertToScalableVector(SrcVecVT, SrcVec, DAG, Subtarget);
+    }
+    if (SrcVecVT.getVectorMinNumElements() < VT.getVectorMinNumElements())
+      SrcVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), SrcVec, 0);
+    else
+      SrcVec = DAG.getExtractSubvector(DL, VT, SrcVec, 0);
+
+    return getVSlideup(DAG, Subtarget, DL, VT, SrcVec, N->getOperand(1),
+                       DAG.getConstant(1, DL, XLenVT), N->getOperand(3),
+                       N->getOperand(4));
+  }
   }
 
   return SDValue();
@@ -21120,9 +21329,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
     auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
     auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
 
-    // Bail if we might break a sh{1,2,3}add pattern.
-    if ((Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 &&
-        C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3 && N->hasOneUse() &&
+    bool IsShXAdd =
+        (Subtarget.hasStdExtZba() || Subtarget.hasVendorXAndesPerf()) && C2 &&
+        C2->getZExtValue() >= 1 && C2->getZExtValue() <= 3;
+    bool IsQCShlAdd = Subtarget.hasVendorXqciac() && C2 &&
+                      C2->getZExtValue() >= 4 && C2->getZExtValue() <= 31;
+
+    // Bail if we might break a sh{1,2,3}add/qc.shladd pattern.
+    if ((IsShXAdd || IsQCShlAdd) && N->hasOneUse() &&
         N->user_begin()->getOpcode() == ISD::ADD &&
         !isUsedByLdSt(*N->user_begin(), nullptr) &&
         !isa<ConstantSDNode>(N->user_begin()->getOperand(1)))
@@ -21346,6 +21560,24 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known = Known.sext(BitWidth);
     break;
   }
+  case RISCVISD::SRLW: {
+    KnownBits Known2; // Known bits of the shift amount (operand 1).
+    Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::lshr(Known.trunc(32), Known2.trunc(5).zext(32));
+    // Computed on the low 32 bits with a 5-bit amount; sign extend back.
+    Known = Known.sext(BitWidth);
+    break;
+  }
+  case RISCVISD::SRAW: {
+    KnownBits Known2; // Known bits of the shift amount (operand 1).
+    Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::ashr(Known.trunc(32), Known2.trunc(5).zext(32));
+    // Computed on the low 32 bits with a 5-bit amount; sign extend back.
+    Known = Known.sext(BitWidth);
+    break;
+  }
   case RISCVISD::CTZW: {
     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
     unsigned PossibleTZ = Known2.trunc(32).countMaxTrailingZeros();
@@ -21451,8 +21683,16 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
     if (Tmp < 33) return 1;
     return 33;
   }
+  case RISCVISD::SRAW: {
+    unsigned Tmp =
+        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // sraw produces at least 33 sign bits. If the input already has more than
+    // 33 sign bits, sraw will preserve them.
+    // TODO: A more precise answer could be calculated depending on known bits
+    // in the shift amount.
+    return std::max(Tmp, 33U);
+  }
   case RISCVISD::SLLW:
-  case RISCVISD::SRAW:
   case RISCVISD::SRLW:
   case RISCVISD::DIVW:
   case RISCVISD::DIVUW:
@@ -21463,9 +21703,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
   case RISCVISD::FCVT_WU_RV64:
   case RISCVISD::STRICT_FCVT_W_RV64:
   case RISCVISD::STRICT_FCVT_WU_RV64:
-    // TODO: As the result is sign-extended, this is conservatively correct. A
-    // more precise answer could be calculated for SRAW depending on known
-    // bits in the shift amount.
+    // TODO: As the result is sign-extended, this is conservatively correct.
     return 33;
   case RISCVISD::VMV_X_S: {
     // The number of sign bits of the scalar result is computed by obtaining the
@@ -21548,6 +21786,14 @@ bool RISCVTargetLowering::canCreateUndefOrPoisonForTargetNode(
 
   // TODO: Add more target nodes.
   switch (Op.getOpcode()) {
+  case RISCVISD::SLLW:
+  case RISCVISD::SRAW:
+  case RISCVISD::SRLW:
+  case RISCVISD::RORW:
+  case RISCVISD::ROLW:
+    // Only the lower 5 bits of RHS are read, guaranteeing the rotate/shift
+    // amount is in bounds.
+    return false;
   case RISCVISD::SELECT_CC:
     // Integer comparisons cannot create poison.
     assert(Op.getOperand(0).getValueType().isInteger() &&
@@ -24683,7 +24929,7 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
 
 bool RISCVTargetLowering::shouldFoldSelectWithSingleBitTest(
     EVT VT, const APInt &AndMask) const {
-  if (Subtarget.hasStdExtZicond() || Subtarget.hasVendorXVentanaCondOps())
+  if (Subtarget.hasCZEROLike())
     return !Subtarget.hasStdExtZbs() && AndMask.ugt(1024);
   return TargetLowering::shouldFoldSelectWithSingleBitTest(VT, AndMask);
 }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fb63ebcfaace..4581c11356af 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -435,8 +435,8 @@ public:
                             const APInt &GapMask) const override;
 
   bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                             ShuffleVectorInst *SVI,
-                             unsigned Factor) const override;
+                             ShuffleVectorInst *SVI, unsigned Factor,
+                             const APInt &GapMask) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
index 43621b8f0f33..9664ab345dcb 100644
--- a/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
+++ b/llvm/lib/Target/RISCV/RISCVIndirectBranchTracking.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// The pass adds LPAD (AUIPC with rs1 = X0) machine instructions at the
+// The pass adds LPAD (AUIPC with rd = X0) machine instructions at the
 // beginning of each basic block or function that is referenced by an indirect
 // jump/call instruction.
 //
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
index 209c3fae63f4..4c7cd05723ac 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsC.td
@@ -54,7 +54,6 @@ class RVInst16CSS<bits<3> funct3, bits<2> opcode, dag outs, dag ins,
     : RVInst16<outs, ins, opcodestr, argstr, [], InstFormatCSS> {
   bits<10> imm;
   bits<5> rs2;
-  bits<5> rs1;
 
   let Inst{15-13} = funct3;
   let Inst{12-7} = imm{5-0};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7b4a1de16769..d0bb57a3eaa1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -80,8 +80,8 @@ namespace llvm::RISCV {
 
 } // end namespace llvm::RISCV
 
-RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
-    : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
+RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI)
+    : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
       STI(STI) {}
 
 #define GET_INSTRINFO_HELPERS
@@ -3511,6 +3511,9 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
       return outliner::InstrType::Illegal;
   }
 
+  if (isLPAD(MI))
+    return outliner::InstrType::Illegal;
+
   return outliner::InstrType::Legal;
 }
 
@@ -4796,8 +4799,22 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) {
   return Scaled;
 }
 
-/// Given two VL operands, do we know that LHS <= RHS?
+static std::optional<int64_t> getEffectiveImm(const MachineOperand &MO) {
+  assert(MO.isImm() || MO.getReg().isVirtual());
+  if (MO.isImm())
+    return MO.getImm();
+  // Not an immediate: inspect the instruction defining the virtual register.
+  const MachineRegisterInfo &MRI = MO.getParent()->getMF()->getRegInfo();
+  int64_t LoadedImm;
+  if (!isLoadImm(MRI.getVRegDef(MO.getReg()), LoadedImm))
+    return std::nullopt;
+  return LoadedImm;
+}
+
+/// Given two VL operands, do we know that LHS <= RHS? Must be used in SSA form.
 bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
+  assert((LHS.isImm() || LHS.getParent()->getMF()->getRegInfo().isSSA()) &&
+         (RHS.isImm() || RHS.getParent()->getMF()->getRegInfo().isSSA()));
   if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() &&
       LHS.getReg() == RHS.getReg())
     return true;
@@ -4807,9 +4824,11 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) {
     return true;
   if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel)
     return false;
-  if (!LHS.isImm() || !RHS.isImm())
+  std::optional<int64_t> LHSImm = getEffectiveImm(LHS),
+                         RHSImm = getEffectiveImm(RHS);
+  if (!LHSImm || !RHSImm)
     return false;
-  return LHS.getImm() <= RHS.getImm();
+  return LHSImm <= RHSImm;
 }
 
 namespace {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 785c8352d4a5..57ec431749eb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -62,7 +62,7 @@ enum RISCVMachineCombinerPattern : unsigned {
 class RISCVInstrInfo : public RISCVGenInstrInfo {
 
 public:
-  explicit RISCVInstrInfo(RISCVSubtarget &STI);
+  explicit RISCVInstrInfo(const RISCVSubtarget &STI);
 
   MCInst getNop() const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 23f5a848137c..92552b36aa0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1698,8 +1698,6 @@ let Predicates = [IsRV32] in {
 def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible
 }
 let Predicates = [IsRV64] in {
-def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)),
-          (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>;
 def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible
 def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>;
 }
@@ -2330,7 +2328,6 @@ include "RISCVInstrInfoZalasr.td"
 include "RISCVInstrInfoZimop.td"
 include "RISCVInstrInfoZicbo.td"
 include "RISCVInstrInfoZicond.td"
-include "RISCVInstrInfoZicfiss.td"
 include "RISCVInstrInfoZilsd.td"
 
 // Scalar FP
@@ -2359,6 +2356,9 @@ include "RISCVInstrInfoZc.td"
 include "RISCVInstrInfoZcmop.td"
 include "RISCVInstrInfoZclsd.td"
 
+// Control Flow Integrity, this requires Zimop/Zcmop
+include "RISCVInstrInfoZicfiss.td"
+
 // Short Forward Branch
 include "RISCVInstrInfoSFB.td"
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index c5551fbdec28..9fc73662d970 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -230,13 +230,17 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
 class CStackLoad<bits<3> funct3, string OpcodeStr,
                  DAGOperand cls, DAGOperand opnd>
     : RVInst16CI<funct3, 0b10, (outs cls:$rd), (ins SPMem:$rs1, opnd:$imm),
-                 OpcodeStr, "$rd, ${imm}(${rs1})">;
+                 OpcodeStr, "$rd, ${imm}(${rs1})"> {
+  bits<0> rs1;
+}
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
 class CStackStore<bits<3> funct3, string OpcodeStr,
                   DAGOperand cls, DAGOperand opnd>
     : RVInst16CSS<funct3, 0b10, (outs), (ins cls:$rs2, SPMem:$rs1, opnd:$imm),
-                  OpcodeStr, "$rs2, ${imm}(${rs1})">;
+                  OpcodeStr, "$rs2, ${imm}(${rs1})"> {
+  bits<0> rs1;
+}
 
 let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
 class CLoad_ri<bits<3> funct3, string OpcodeStr,
@@ -301,14 +305,6 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
   let Inst{5} = imm{3};
 }
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FLD  : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
-             Sched<[WriteFLD64, ReadFMemBase]> {
-  bits<8> imm;
-  let Inst{12-10} = imm{5-3};
-  let Inst{6-5} = imm{7-6};
-}
-
 def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00>,
            Sched<[WriteLDW, ReadMemBase]> {
   bits<7> imm;
@@ -326,16 +322,6 @@ def C_LW_INX : CLoad_ri<0b010, "c.lw", GPRF32C, uimm7_lsb00>,
   let Inst{5} = imm{6};
 }
 
-let DecoderNamespace = "RV32Only",
-    Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FLW  : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
-             Sched<[WriteFLD32, ReadFMemBase]> {
-  bits<7> imm;
-  let Inst{12-10} = imm{5-3};
-  let Inst{6} = imm{2};
-  let Inst{5} = imm{6};
-}
-
 let Predicates = [HasStdExtZca, IsRV64] in
 def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
            Sched<[WriteLDD, ReadMemBase]> {
@@ -344,14 +330,6 @@ def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
   let Inst{6-5} = imm{7-6};
 }
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FSD  : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
-             Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
-  bits<8> imm;
-  let Inst{12-10} = imm{5-3};
-  let Inst{6-5} = imm{7-6};
-}
-
 def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00>,
            Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
   bits<7> imm;
@@ -369,16 +347,6 @@ def C_SW_INX : CStore_rri<0b110, "c.sw", GPRF32C, uimm7_lsb00>,
   let Inst{5} = imm{6};
 }
 
-let DecoderNamespace = "RV32Only",
-    Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]  in
-def C_FSW  : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
-             Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
-  bits<7> imm;
-  let Inst{12-10} = imm{5-3};
-  let Inst{6} = imm{2};
-  let Inst{5} = imm{6};
-}
-
 let Predicates = [HasStdExtZca, IsRV64] in
 def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>,
            Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
@@ -500,12 +468,6 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb),
   let Constraints = "$rd = $rd_wb";
 }
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FLDSP  : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
-               Sched<[WriteFLD64, ReadFMemBase]> {
-  let Inst{4-2} = imm{8-6};
-}
-
 def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>,
              Sched<[WriteLDW, ReadMemBase]> {
   let Inst{3-2} = imm{7-6};
@@ -517,13 +479,6 @@ def C_LWSP_INX : CStackLoad<0b010, "c.lwsp", GPRF32NoX0, uimm8_lsb00>,
   let Inst{3-2} = imm{7-6};
 }
 
-let DecoderNamespace = "RV32Only",
-    Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FLWSP  : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
-               Sched<[WriteFLD32, ReadFMemBase]> {
-  let Inst{3-2} = imm{7-6};
-}
-
 let Predicates = [HasStdExtZca, IsRV64] in
 def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>,
              Sched<[WriteLDD, ReadMemBase]> {
@@ -560,12 +515,6 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd),
   let Constraints = "$rs1 = $rd";
 }
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in
-def C_FSDSP  : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
-               Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
-  let Inst{9-7}   = imm{8-6};
-}
-
 def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>,
              Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
   let Inst{8-7}  = imm{7-6};
@@ -577,13 +526,6 @@ def C_SWSP_INX : CStackStore<0b110, "c.swsp", GPRF32, uimm8_lsb00>,
   let Inst{8-7}  = imm{7-6};
 }
 
-let DecoderNamespace = "RV32Only",
-    Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in
-def C_FSWSP  : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
-               Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
-  let Inst{8-7}  = imm{7-6};
-}
-
 let Predicates = [HasStdExtZca, IsRV64] in
 def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>,
              Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
@@ -600,6 +542,61 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther>,
 
 } // Predicates = [HasStdExtZca]
 
+let DecoderNamespace = "RV32Only",
+    Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
+  def C_FLW  : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
+               Sched<[WriteFLD32, ReadFMemBase]> {
+    bits<7> imm;
+    let Inst{12-10} = imm{5-3};
+    let Inst{6} = imm{2};
+    let Inst{5} = imm{6};
+  }
+
+  def C_FSW  : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
+               Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
+    bits<7> imm;
+    let Inst{12-10} = imm{5-3};
+    let Inst{6} = imm{2};
+    let Inst{5} = imm{6};
+  }
+
+  def C_FLWSP  : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
+                 Sched<[WriteFLD32, ReadFMemBase]> {
+    let Inst{3-2} = imm{7-6};
+  }
+
+  def C_FSWSP  : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
+                 Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> {
+    let Inst{8-7}  = imm{7-6};
+  }
+} // DecoderNamespace = "RV32Only", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
+
+let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
+  def C_FLD  : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
+               Sched<[WriteFLD64, ReadFMemBase]> {
+    bits<8> imm;
+    let Inst{12-10} = imm{5-3};
+    let Inst{6-5} = imm{7-6};
+  }
+
+  def C_FSD  : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
+               Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
+    bits<8> imm;
+    let Inst{12-10} = imm{5-3};
+    let Inst{6-5} = imm{7-6};
+  }
+
+  def C_FLDSP  : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
+                 Sched<[WriteFLD64, ReadFMemBase]> {
+    let Inst{4-2} = imm{8-6};
+  }
+
+  def C_FSDSP  : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
+                 Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> {
+    let Inst{9-7}   = imm{8-6};
+  }
+} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
+
 //===----------------------------------------------------------------------===//
 // HINT Instructions
 //===----------------------------------------------------------------------===//
@@ -767,20 +764,17 @@ def : InstAlias<".insn_cj $opcode, $funct3, $imm11",
 // Compress Instruction tablegen backend.
 //===----------------------------------------------------------------------===//
 
-// Patterns are defined in the same order the compressed instructions appear
+// Zca patterns are defined in the same order the compressed instructions appear
 // under the "RVC Instruction Set Listings" section of the ISA manual.
 
+// Zca Instructions
+
 // Quadrant 0
 let Predicates = [HasStdExtZca] in {
 def : CompressPat<(ADDI GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm),
                   (C_ADDI4SPN GPRC:$rd, SP:$rs1, uimm10_lsb00nonzero:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
-                  (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
 let Predicates = [HasStdExtZca] in {
 def : CompressPat<(LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
@@ -790,21 +784,11 @@ def : CompressPat<(LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
-                  (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
 let Predicates = [HasStdExtZca, IsRV64] in {
 def : CompressPat<(LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
                   (C_LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
 } // Predicates = [HasStdExtZca, IsRV64]
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
-                  (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
 let Predicates = [HasStdExtZca] in {
 def : CompressPat<(SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
@@ -814,11 +798,6 @@ def : CompressPat<(SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
-                  (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
 let Predicates = [HasStdExtZca, IsRV64] in {
 def : CompressPat<(SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
                   (C_SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
@@ -907,11 +886,6 @@ def : CompressPat<(SLLI GPRNoX0:$rs1, GPRNoX0:$rs1, uimmlog2xlennonzero:$imm),
                   (C_SLLI GPRNoX0:$rs1, uimmlog2xlennonzero:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
-                  (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
 let Predicates = [HasStdExtZca] in {
 def : CompressPat<(LW GPRNoX0:$rd, SPMem:$rs1,  uimm8_lsb00:$imm),
                   (C_LWSP GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
@@ -921,11 +895,6 @@ def : CompressPat<(LW_INX GPRF32NoX0:$rd, SPMem:$rs1,  uimm8_lsb00:$imm),
                   (C_LWSP_INX GPRF32NoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
-                  (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
 let Predicates = [HasStdExtZca, IsRV64] in {
 def : CompressPat<(LD GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
                   (C_LDSP GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
@@ -953,11 +922,6 @@ def : CompressPat<(ADD GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs1),
                   (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
-def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
-                  (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
-} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
-
 let Predicates = [HasStdExtZca] in {
 def : CompressPat<(SW GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
                   (C_SWSP GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
@@ -967,12 +931,38 @@ def : CompressPat<(SW_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
                   (C_SWSP_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
 } // Predicates = [HasStdExtZca]
 
-let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
-def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
-                  (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
-
 let Predicates = [HasStdExtZca, IsRV64] in {
 def : CompressPat<(SD GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
                   (C_SDSP GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
 } // Predicates = [HasStdExtZca, IsRV64]
+
+// Zcf Instructions
+let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
+  // Quadrant 0
+  def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
+                    (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
+  def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
+                    (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
+
+  // Quadrant 2
+  def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
+                    (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
+  def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
+                    (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
+
+// Zcd Instructions
+let Predicates = [HasStdExtCOrZcd, HasStdExtD] in {
+  // Quadrant 0
+  def : CompressPat<(FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
+                    (C_FLD FPR64C:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
+  def : CompressPat<(FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
+                    (C_FSD FPR64C:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm)>;
+
+  // Quadrant 2
+  def : CompressPat<(FLD FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
+                    (C_FLDSP FPR64:$rd, SPMem:$rs1, uimm9_lsb000:$imm)>;
+  def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),
+                    (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>;
+} // Predicates = [HasStdExtCOrZcd, HasStdExtD]
+
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index c342b41e41d0..6840dacaea54 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -25,7 +25,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
 }
 
 // A 8-bit signed immediate allowing range [-128, 255]
-// but represented as [-128, 255].
+// but represented as [-128, 127].
 def simm8_unsigned : RISCVOp {
   let ParserMatchClass = SImm8UnsignedAsmOperand;
   let EncoderMethod = "getImmOpValue";
@@ -98,6 +98,40 @@ class PLUI_i<bits<7> funct7, string opcodestr>
   let Inst{23-15} = imm10{9-1};
 }
 
+// Common base for widening shift/binary/ternary register-register ops
+class RVPWideningBase<bits<2> w, bit arith_shift, dag outs, dag ins,
+                      string opcodestr>
+  : RVInst<outs, ins, opcodestr, "$rd, $rs1, $rs2", [], InstFormatOther> {
+  bits<5> rs2;
+  bits<5> rs1;
+  bits<5> rd;
+  // Inst{30-27} is not set here; each subclass fills it in.
+  let Inst{31}    = 0b0;
+  let Inst{26-25} = w;
+  let Inst{24-20} = rs2;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = 0b010;
+  let Inst{11-8}  = rd{4-1}; // rd{0} is not encoded.
+  let Inst{7}     = arith_shift;
+  let Inst{6-0}   = OPC_OP_IMM_32.Value;
+}
+
+// Common base for narrowing ops
+class RVPNarrowingBase<bits<3> f, bit r, bits<4> funct4, dag outs, dag ins,
+                      string opcodestr, string argstr>
+  : RVInst<outs, ins, opcodestr, argstr, [], InstFormatOther> {
+  bits<5> rs1;
+  bits<5> rd;
+  // Inst{26-20} is not set here; subclasses encode shamt, or w plus rs2.
+  let Inst{31}    = 0b0;
+  let Inst{30-28} = f;
+  let Inst{27}    = r;
+  let Inst{19-16} = rs1{4-1}; // rs1{0} is not encoded.
+  let Inst{15-12} = funct4;
+  let Inst{11-7}  = rd;
+  let Inst{6-0}   = OPC_OP_IMM_32.Value;
+}
+
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
     : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
@@ -141,6 +175,100 @@ class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr>
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningShift_ri<bits<3> f, string opcodestr, Operand ImmType>
+    : RVInst<(outs GPRPairRV32:$rd), (ins GPR:$rs1, ImmType:$shamt), opcodestr,
+             "$rd, $rs1, $shamt", [], InstFormatOther> {
+  bits<5> rs1;
+  bits<5> rd;
+  // Inst{26-20} is not set here; the W/H/B subclasses encode shamt there.
+  let Inst{31}    = 0b0;
+  let Inst{30-28} = f;
+  let Inst{27}    = 0b0;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = 0b010;
+  let Inst{11-8}  = rd{4-1}; // rd{0} is not encoded.
+  let Inst{7}     = 0b0;
+  let Inst{6-0}   = OPC_OP_IMM_32.Value;
+
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
+class RVPWideningShiftW_ri<bits<3> f, string opcodestr>
+    : RVPWideningShift_ri<f, opcodestr, uimm6> {
+  bits<6> shamt;
+
+  let Inst{26} = 0b1;
+  let Inst{25-20} = shamt;
+}
+
+class RVPWideningShiftH_ri<bits<3> f, string opcodestr>
+    : RVPWideningShift_ri<f, opcodestr, uimm5> {
+  bits<5> shamt;
+
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = shamt;
+}
+
+class RVPWideningShiftB_ri<bits<3> f, string opcodestr>
+    : RVPWideningShift_ri<f, opcodestr, uimm4> {
+  bits<4> shamt;
+
+  let Inst{26-24} = 0b001;
+  let Inst{23-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingShift_ri<bits<3> f, string opcodestr, Operand ImmType>
+    : RVPNarrowingBase<f, 0b0, 0b1100, (outs GPR:$rd),
+                       (ins GPRPairRV32:$rs1, ImmType:$shamt), opcodestr,
+                       "$rd, $rs1, $shamt">;
+
+class RVPNarrowingShiftW_ri<bits<3> f, string opcodestr>
+    : RVPNarrowingShift_ri<f, opcodestr, uimm6> {
+  bits<6> shamt;
+
+  let Inst{26} = 0b1;
+  let Inst{25-20} = shamt;
+}
+
+class RVPNarrowingShiftH_ri<bits<3> f, string opcodestr>
+    : RVPNarrowingShift_ri<f, opcodestr, uimm5> {
+  bits<5> shamt;
+
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = shamt;
+}
+
+class RVPNarrowingShiftB_ri<bits<3> f, string opcodestr>
+    : RVPNarrowingShift_ri<f, opcodestr, uimm4> {
+  bits<4> shamt;
+
+  let Inst{26-24} = 0b001;
+  let Inst{23-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingShift_rr<bits<3> f, bits<2> w, string opcodestr>
+    : RVPNarrowingBase<f, 0b1, 0b1100, (outs GPR:$rd),
+                       (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr,
+                       "$rd, $rs1, $rs2"> {
+  bits<5> rs2;
+
+  let Inst{26-25} = w;
+  let Inst{24-20} = rs2;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningShift_rr<bits<3> f, bits<2> w, string opcodestr>
+    : RVPWideningBase<w, 0b0, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2),
+                      opcodestr> {
+  let Inst{30-28} = f;
+  let Inst{27} = 0b1;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
     : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1),
                   opcodestr, "$rd, $rs1">  {
@@ -169,6 +297,24 @@ class RVPBinary_rr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
 }
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningBinary_rr<bits<4> f, bits<2> w, string opcodestr>
+    : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd), (ins GPR:$rs1, GPR:$rs2),
+                      opcodestr> {
+  let Inst{30-27} = f;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPNarrowingBinary_rr<bits<3> f, bits<2> w, string opcodestr>
+    : RVPNarrowingBase<f, 0b1, 0b0100, (outs GPR:$rd),
+                       (ins GPRPairRV32:$rs1, GPR:$rs2), opcodestr,
+                       "$rd, $rs1, $rs2"> {
+  bits<5> rs2;
+
+  let Inst{26-25} = w;
+  let Inst{24-20} = rs2;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
     : RVInstRBase<funct3, OPC_OP_32, (outs GPR:$rd_wb),
                   (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr,
@@ -180,6 +326,15 @@ class RVPTernary_rrr<bits<4> f, bits<2> w, bits<3> funct3, string opcodestr>
   let Constraints = "$rd = $rd_wb";
 }
 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVPWideningTernary_rrr<bits<4> f, bits<2> w, string opcodestr>
+    : RVPWideningBase<w, 0b1, (outs GPRPairRV32:$rd_wb),
+                      (ins GPRPairRV32:$rd, GPR:$rs1, GPR:$rs2), opcodestr> {
+  let Inst{30-27} = f;
+  // $rd is tied to $rd_wb, so both use the GPRPairRV32 register class.
+  let Constraints = "$rd = $rd_wb";
+}
+
 // Common base for pli.db/h/w and plui.dh/w
 class RVPPairLoadImm_i<bits<7> funct7, dag ins, string opcodestr,
                        string argstr>
@@ -889,3 +1044,156 @@ let Predicates = [HasStdExtP, IsRV32] in {
     let Inst{23-15} = imm10{9-1};
   }
 }
+
+let Predicates = [HasStdExtP, IsRV32] in { // Widening/narrowing P ops (RV32).
+  def PWSLLI_B     : RVPWideningShiftB_ri<0b000, "pwslli.b">;
+  def PWSLLI_H     : RVPWideningShiftH_ri<0b000, "pwslli.h">;
+  def WSLLI        : RVPWideningShiftW_ri<0b000, "wslli">;
+
+  def PWSLAI_B     : RVPWideningShiftB_ri<0b100, "pwslai.b">;
+  def PWSLAI_H     : RVPWideningShiftH_ri<0b100, "pwslai.h">;
+  def WSLAI        : RVPWideningShiftW_ri<0b100, "wslai">;
+
+  def PWSLL_BS     : RVPWideningShift_rr<0b000, 0b00, "pwsll.bs">;
+  def PWSLL_HS     : RVPWideningShift_rr<0b000, 0b01, "pwsll.hs">;
+  def WSLL         : RVPWideningShift_rr<0b000, 0b11, "wsll">;
+
+  def PWSLA_BS     : RVPWideningShift_rr<0b100, 0b00, "pwsla.bs">;
+  def PWSLA_HS     : RVPWideningShift_rr<0b100, 0b01, "pwsla.hs">;
+  def WSLA         : RVPWideningShift_rr<0b100, 0b11, "wsla">;
+
+  def WZIP8P       : RVPWideningShift_rr<0b111, 0b00, "wzip8p">;
+  def WZIP16P      : RVPWideningShift_rr<0b111, 0b01, "wzip16p">;
+
+  def PWADD_H      : RVPWideningBinary_rr<0b0000, 0b00, "pwadd.h">;
+  def WADD         : RVPWideningBinary_rr<0b0000, 0b01, "wadd">;
+  def PWADD_B      : RVPWideningBinary_rr<0b0000, 0b10, "pwadd.b">;
+  def PM2WADD_H    : RVPWideningBinary_rr<0b0000, 0b11, "pm2wadd.h">;
+
+  def PWADDA_H     : RVPWideningTernary_rrr<0b0001, 0b00, "pwadda.h">;
+  def WADDA        : RVPWideningTernary_rrr<0b0001, 0b01, "wadda">;
+  def PWADDA_B     : RVPWideningTernary_rrr<0b0001, 0b10, "pwadda.b">;
+  def PM2WADDA_H   : RVPWideningTernary_rrr<0b0001, 0b11, "pm2wadda.h">;
+
+  def PWADDU_H     : RVPWideningBinary_rr<0b0010, 0b00, "pwaddu.h">;
+  def WADDU        : RVPWideningBinary_rr<0b0010, 0b01, "waddu">;
+  def PWADDU_B     : RVPWideningBinary_rr<0b0010, 0b10, "pwaddu.b">;
+  def PM2WADD_HX   : RVPWideningBinary_rr<0b0010, 0b11, "pm2wadd.hx">;
+
+  def PWADDAU_H    : RVPWideningTernary_rrr<0b0011, 0b00, "pwaddau.h">;
+  def WADDAU       : RVPWideningTernary_rrr<0b0011, 0b01, "waddau">;
+  def PWADDAU_B    : RVPWideningTernary_rrr<0b0011, 0b10, "pwaddau.b">;
+  def PM2WADDA_HX  : RVPWideningTernary_rrr<0b0011, 0b11, "pm2wadda.hx">;
+
+  def PWMUL_H      : RVPWideningBinary_rr<0b0100, 0b00, "pwmul.h">;
+  def WMUL         : RVPWideningBinary_rr<0b0100, 0b01, "wmul">;
+  def PWMUL_B      : RVPWideningBinary_rr<0b0100, 0b10, "pwmul.b">;
+  def PM2WADDU_H   : RVPWideningBinary_rr<0b0100, 0b11, "pm2waddu.h">;
+
+  def PWMACC_H     : RVPWideningTernary_rrr<0b0101, 0b00, "pwmacc.h">;
+  def WMACC        : RVPWideningTernary_rrr<0b0101, 0b01, "wmacc">;
+  def PM2WADDAU_H  : RVPWideningTernary_rrr<0b0101, 0b11, "pm2waddau.h">;
+
+  def PWMULU_H     : RVPWideningBinary_rr<0b0110, 0b00, "pwmulu.h">;
+  def WMULU        : RVPWideningBinary_rr<0b0110, 0b01, "wmulu">;
+  def PWMULU_B     : RVPWideningBinary_rr<0b0110, 0b10, "pwmulu.b">;
+
+  def PWMACCU_H    : RVPWideningTernary_rrr<0b0111, 0b00, "pwmaccu.h">;
+  def WMACCU       : RVPWideningTernary_rrr<0b0111, 0b01, "wmaccu">;
+
+  def PWSUB_H      : RVPWideningBinary_rr<0b1000, 0b00, "pwsub.h">;
+  def WSUB         : RVPWideningBinary_rr<0b1000, 0b01, "wsub">;
+  def PWSUB_B      : RVPWideningBinary_rr<0b1000, 0b10, "pwsub.b">;
+  def PM2WSUB_H    : RVPWideningBinary_rr<0b1000, 0b11, "pm2wsub.h">;
+
+  def PWSUBA_H     : RVPWideningTernary_rrr<0b1001, 0b00, "pwsuba.h">;
+  def WSUBA        : RVPWideningTernary_rrr<0b1001, 0b01, "wsuba">;
+  def PWSUBA_B     : RVPWideningTernary_rrr<0b1001, 0b10, "pwsuba.b">;
+  def PM2WSUBA_H   : RVPWideningTernary_rrr<0b1001, 0b11, "pm2wsuba.h">;
+
+  def PWSUBU_H     : RVPWideningBinary_rr<0b1010, 0b00, "pwsubu.h">;
+  def WSUBU        : RVPWideningBinary_rr<0b1010, 0b01, "wsubu">;
+  def PWSUBU_B     : RVPWideningBinary_rr<0b1010, 0b10, "pwsubu.b">;
+  def PM2WSUB_HX   : RVPWideningBinary_rr<0b1010, 0b11, "pm2wsub.hx">;
+
+  def PWSUBAU_H    : RVPWideningTernary_rrr<0b1011, 0b00, "pwsubau.h">;
+  def WSUBAU       : RVPWideningTernary_rrr<0b1011, 0b01, "wsubau">;
+  def PWSUBAU_B    : RVPWideningTernary_rrr<0b1011, 0b10, "pwsubau.b">;
+  def PM2WSUBA_HX  : RVPWideningTernary_rrr<0b1011, 0b11, "pm2wsuba.hx">;
+
+  def PWMULSU_H    : RVPWideningBinary_rr<0b1100, 0b00, "pwmulsu.h">;
+  def WMULSU       : RVPWideningBinary_rr<0b1100, 0b01, "wmulsu">;
+  def PWMULSU_B    : RVPWideningBinary_rr<0b1100, 0b10, "pwmulsu.b">;
+  def PM2WADDSU_H  : RVPWideningBinary_rr<0b1100, 0b11, "pm2waddsu.h">;
+
+  def PWMACCSU_H   : RVPWideningTernary_rrr<0b1101, 0b00, "pwmaccsu.h">;
+  def WMACCSU      : RVPWideningTernary_rrr<0b1101, 0b01, "wmaccsu">;
+  def PM2WADDASU_H : RVPWideningTernary_rrr<0b1101, 0b11, "pm2waddasu.h">;
+
+  def PMQWACC_H    : RVPWideningTernary_rrr<0b1111, 0b00, "pmqwacc.h">;
+  def PMQWACC      : RVPWideningTernary_rrr<0b1111, 0b01, "pmqwacc">;
+  def PMQRWACC_H   : RVPWideningTernary_rrr<0b1111, 0b10, "pmqrwacc.h">;
+  def PMQRWACC     : RVPWideningTernary_rrr<0b1111, 0b11, "pmqrwacc">;
+
+  def PREDSUM_DHS  : RVPNarrowingBinary_rr<0b001, 0b00, "predsum.dhs">;
+  def PREDSUM_DBS  : RVPNarrowingBinary_rr<0b001, 0b10, "predsum.dbs">;
+
+  def PREDSUMU_DHS : RVPNarrowingBinary_rr<0b011, 0b00, "predsumu.dhs">;
+  def PREDSUMU_DBS : RVPNarrowingBinary_rr<0b011, 0b10, "predsumu.dbs">;
+
+  def PNSRLI_B     : RVPNarrowingShiftB_ri<0b000, "pnsrli.b">;
+  def PNSRLI_H     : RVPNarrowingShiftH_ri<0b000, "pnsrli.h">;
+  def NSRLI        : RVPNarrowingShiftW_ri<0b000, "nsrli">;
+
+  def PNCLIPIU_B   : RVPNarrowingShiftB_ri<0b010, "pnclipiu.b">;
+  def PNCLIPIU_H   : RVPNarrowingShiftH_ri<0b010, "pnclipiu.h">;
+  def NCLIPIU      : RVPNarrowingShiftW_ri<0b010, "nclipiu">;
+
+  def PNCLIPRIU_B  : RVPNarrowingShiftB_ri<0b011, "pnclipriu.b">;
+  def PNCLIPRIU_H  : RVPNarrowingShiftH_ri<0b011, "pnclipriu.h">;
+  def NCLIPRIU     : RVPNarrowingShiftW_ri<0b011, "nclipriu">;
+
+  def PNSRAI_B     : RVPNarrowingShiftB_ri<0b100, "pnsrai.b">;
+  def PNSRAI_H     : RVPNarrowingShiftH_ri<0b100, "pnsrai.h">;
+  def NSRAI        : RVPNarrowingShiftW_ri<0b100, "nsrai">;
+
+  def PNSARI_B     : RVPNarrowingShiftB_ri<0b101, "pnsari.b">;
+  def PNSARI_H     : RVPNarrowingShiftH_ri<0b101, "pnsari.h">;
+  def NSARI        : RVPNarrowingShiftW_ri<0b101, "nsari">;
+
+  def PNCLIPI_B    : RVPNarrowingShiftB_ri<0b110, "pnclipi.b">;
+  def PNCLIPI_H    : RVPNarrowingShiftH_ri<0b110, "pnclipi.h">;
+  def NCLIPI       : RVPNarrowingShiftW_ri<0b110, "nclipi">;
+
+  def PNCLIPRI_B   : RVPNarrowingShiftB_ri<0b111, "pnclipri.b">;
+  def PNCLIPRI_H   : RVPNarrowingShiftH_ri<0b111, "pnclipri.h">;
+  def NCLIPRI      : RVPNarrowingShiftW_ri<0b111, "nclipri">;
+
+  def PNSRL_BS     : RVPNarrowingShift_rr<0b000, 0b00, "pnsrl.bs">;
+  def PNSRL_HS     : RVPNarrowingShift_rr<0b000, 0b01, "pnsrl.hs">;
+  def NSRL         : RVPNarrowingShift_rr<0b000, 0b11, "nsrl">;
+
+  def PNCLIPU_BS   : RVPNarrowingShift_rr<0b010, 0b00, "pnclipu.bs">;
+  def PNCLIPU_HS   : RVPNarrowingShift_rr<0b010, 0b01, "pnclipu.hs">;
+  def NCLIPU       : RVPNarrowingShift_rr<0b010, 0b11, "nclipu">;
+
+  def PNCLIPRU_BS  : RVPNarrowingShift_rr<0b011, 0b00, "pnclipru.bs">;
+  def PNCLIPRU_HS  : RVPNarrowingShift_rr<0b011, 0b01, "pnclipru.hs">;
+  def NCLIPRU      : RVPNarrowingShift_rr<0b011, 0b11, "nclipru">;
+
+  def PNSRA_BS     : RVPNarrowingShift_rr<0b100, 0b00, "pnsra.bs">;
+  def PNSRA_HS     : RVPNarrowingShift_rr<0b100, 0b01, "pnsra.hs">;
+  def NSRA         : RVPNarrowingShift_rr<0b100, 0b11, "nsra">;
+
+  def PNSRAR_BS    : RVPNarrowingShift_rr<0b101, 0b00, "pnsrar.bs">;
+  def PNSRAR_HS    : RVPNarrowingShift_rr<0b101, 0b01, "pnsrar.hs">;
+  def NSRAR        : RVPNarrowingShift_rr<0b101, 0b11, "nsrar">;
+
+  def PNCLIP_BS    : RVPNarrowingShift_rr<0b110, 0b00, "pnclip.bs">;
+  def PNCLIP_HS    : RVPNarrowingShift_rr<0b110, 0b01, "pnclip.hs">;
+  def NCLIP        : RVPNarrowingShift_rr<0b110, 0b11, "nclip">;
+
+  def PNCLIPR_BS   : RVPNarrowingShift_rr<0b111, 0b00, "pnclipr.bs">;
+  def PNCLIPR_HS   : RVPNarrowingShift_rr<0b111, 0b01, "pnclipr.hs">;
+  def NCLIPR       : RVPNarrowingShift_rr<0b111, 0b11, "nclipr">;
+} // Predicates = [HasStdExtP, IsRV32]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
index 32f533b8f114..f732ab13e5f8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td
@@ -44,153 +44,95 @@ def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst),
                          Sched<[]>;
 }
 
+class SFBALU_rr // Conditional ALU pseudo whose ALU sources are two GPRs.
+    : Pseudo<(outs GPR:$dst),
+             (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+                  GPR:$rs2), []>, // Empty pattern list.
+      Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU,
+             ReadSFBALU]> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let Size = 8; // NOTE(review): presumably two 4-byte insts; confirm.
+  let Constraints = "$dst = $falsev"; // $dst is tied to $falsev.
+}
+
+class SFBALU_ri // Conditional ALU pseudo: GPR source plus simm12 immediate.
+    : Pseudo<(outs GPR:$dst),
+             (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+                  simm12:$imm), []>, // Empty pattern list.
+      Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let Size = 8; // NOTE(review): presumably two 4-byte insts; confirm.
+  let Constraints = "$dst = $falsev"; // $dst is tied to $falsev.
+}
+
+class SFBShift_ri // Conditional shift pseudo: uimmlog2xlen shift amount.
+    : Pseudo<(outs GPR:$dst),
+             (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+                  uimmlog2xlen:$imm), []>, // Empty pattern list.
+      Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let Size = 8; // NOTE(review): presumably two 4-byte insts; confirm.
+  let Constraints = "$dst = $falsev"; // $dst is tied to $falsev.
+}
+
+class SFBShiftW_ri // Conditional shift pseudo: uimm5 shift amount.
+    : Pseudo<(outs GPR:$dst),
+             (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1,
+                  uimm5:$imm), []>, // Empty pattern list.
+      Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let Size = 8; // NOTE(review): presumably two 4-byte insts; confirm.
+  let Constraints = "$dst = $falsev"; // $dst is tied to $falsev.
+}
+
 // Conditional binops, that updates update $dst to (op rs1, rs2) when condition
 // is true. Returns $falsev otherwise. Selected by optimizeSelect.
 // TODO: Can we use DefaultOperands on the regular binop to accomplish this more
 // like how ARM does predication?
-let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0,
-    mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in {
-def PseudoCCADD : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSUB : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSLL : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                         ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRL : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                         ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRA : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                         ReadSFBALU, ReadSFBALU]>;
-def PseudoCCAND : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCOR  : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCXOR : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+let Predicates = [HasShortForwardBranchOpt] in {
+def PseudoCCADD : SFBALU_rr; // rr forms: ALU sources are GPR $rs1, $rs2.
+def PseudoCCSUB : SFBALU_rr;
+def PseudoCCSLL : SFBALU_rr;
+def PseudoCCSRL : SFBALU_rr;
+def PseudoCCSRA : SFBALU_rr;
+def PseudoCCAND : SFBALU_rr;
+def PseudoCCOR  : SFBALU_rr;
+def PseudoCCXOR : SFBALU_rr;
 
-def PseudoCCADDI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCSLLI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCSRLI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCSRAI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCANDI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCORI  : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
-def PseudoCCXORI : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU]>;
+def PseudoCCADDI : SFBALU_ri; // ri forms: last operand is simm12 $imm.
+def PseudoCCANDI : SFBALU_ri;
+def PseudoCCORI  : SFBALU_ri;
+def PseudoCCXORI : SFBALU_ri;
+
+def PseudoCCSLLI : SFBShift_ri; // Shift forms: uimmlog2xlen $imm.
+def PseudoCCSRLI : SFBShift_ri;
+def PseudoCCSRAI : SFBShift_ri;
 
 // RV64I instructions
-def PseudoCCADDW : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSUBW : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSLLW : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRLW : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU, ReadSFBALU]>;
-def PseudoCCSRAW : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                          ReadSFBALU, ReadSFBALU]>;
+def PseudoCCADDW : SFBALU_rr; // W rr forms share the SFBALU_rr operands.
+def PseudoCCSUBW : SFBALU_rr;
+def PseudoCCSLLW : SFBALU_rr;
+def PseudoCCSRLW : SFBALU_rr;
+def PseudoCCSRAW : SFBALU_rr;
+
+def PseudoCCADDIW : SFBALU_ri; // simm12 $imm.
 
-def PseudoCCADDIW : Pseudo<(outs GPR:$dst),
-                           (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                            GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,
-                    Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                           ReadSFBALU]>;
-def PseudoCCSLLIW : Pseudo<(outs GPR:$dst),
-                           (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                            GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
-                    Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                           ReadSFBALU]>;
-def PseudoCCSRLIW : Pseudo<(outs GPR:$dst),
-                           (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                            GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
-                    Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                           ReadSFBALU]>;
-def PseudoCCSRAIW : Pseudo<(outs GPR:$dst),
-                           (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                            GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>,
-                    Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,
-                           ReadSFBALU]>;
+def PseudoCCSLLIW : SFBShiftW_ri; // W shift forms: uimm5 $imm.
+def PseudoCCSRLIW : SFBShiftW_ri;
+def PseudoCCSRAIW : SFBShiftW_ri;
 
 // Zbb/Zbkb instructions
-def PseudoCCANDN : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCORN : Pseudo<(outs GPR:$dst),
-                         (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
-def PseudoCCXNOR : Pseudo<(outs GPR:$dst),
-                          (ins GPR:$lhs, GPR:$rhs, cond_code:$cc,
-                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>,
-                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp,
-                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>;
+def PseudoCCANDN : SFBALU_rr; // rr forms: ALU sources are GPR $rs1, $rs2.
+def PseudoCCORN  : SFBALU_rr;
+def PseudoCCXNOR : SFBALU_rr;
 }
 
 let Predicates = [HasShortForwardBranchOpt] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index acbccddce2b5..063ee5c5e8b9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -830,19 +830,6 @@ multiclass VPatTiedBinaryNoMaskVL_V<SDNode vop,
                      result_reg_class:$rs1,
                      op2_reg_class:$rs2,
                      GPR:$vl, sew, TAIL_AGNOSTIC)>;
-  // Tail undisturbed
-  def : Pat<(riscv_vmerge_vl true_mask,
-             (result_type (vop
-                           result_reg_class:$rs1,
-                           (op2_type op2_reg_class:$rs2),
-                           srcvalue,
-                           true_mask,
-                           VLOpFrag)),
-             result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
-            (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED")
-                     result_reg_class:$rs1,
-                     op2_reg_class:$rs2,
-                     GPR:$vl, sew, TU_MU)>;
 }
 
 class VPatTiedBinaryMaskVL_V<SDNode vop,
@@ -892,22 +879,6 @@ multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop,
                      // RISCVInsertReadWriteCSR
                      FRM_DYN,
                      GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
-  // Tail undisturbed
-  def : Pat<(riscv_vmerge_vl true_mask,
-             (result_type (vop
-                           result_reg_class:$rs1,
-                           (op2_type op2_reg_class:$rs2),
-                           srcvalue,
-                           true_mask,
-                           VLOpFrag)),
-             result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
-            (!cast<Instruction>(name)
-                     result_reg_class:$rs1,
-                     op2_reg_class:$rs2,
-                     // Value to indicate no rounding mode change in
-                     // RISCVInsertReadWriteCSR
-                     FRM_DYN,
-                     GPR:$vl, log2sew, TU_MU)>;
 }
 
 class VPatBinaryVL_XI<SDPatternOperator vop,
@@ -1755,50 +1726,6 @@ multiclass VPatMultiplyAddVL_VV_VX<SDNode op, string instruction_name> {
   }
 }
 
-multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
-  foreach vti = AllIntegerVectors in {
-  defvar suffix = vti.LMul.MX;
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                (vti.Vector (op vti.RegClass:$rd,
-                                (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
-                                    srcvalue, (vti.Mask true_mask), VLOpFrag),
-                                srcvalue, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                (vti.Vector (op vti.RegClass:$rd,
-                                (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
-                                    srcvalue, (vti.Mask true_mask), VLOpFrag),
-                                srcvalue, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                (vti.Vector (op vti.RegClass:$rd,
-                                (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
-                                    srcvalue, (vti.Mask true_mask), VLOpFrag),
-                                srcvalue, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, undef, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                (vti.Vector (op vti.RegClass:$rd,
-                                (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
-                                    srcvalue, (vti.Mask true_mask), VLOpFrag),
-                                srcvalue, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, undef, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    }
-  }
-}
-
 multiclass VPatWidenMultiplyAddVL_VV_VX<SDNode vwmacc_op, string instr_name> {
   foreach vtiTowti = AllWidenableIntVectors in {
     defvar vti = vtiTowti.Vti;
@@ -1898,82 +1825,6 @@ multiclass VPatFPMulAddVL_VV_VF_RM<SDPatternOperator vop, string instruction_nam
   }
 }
 
-multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
-  foreach vti = AllFloatVectors in {
-  defvar suffix = vti.LMul.MX # "_E" # vti.SEW;
-  let Predicates = GetVTypePredicates<vti>.Predicates in {
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                           (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
-                            vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm),
-                   // Value to indicate no rounding mode change in
-                   // RISCVInsertReadWriteCSR
-                   FRM_DYN,
-                   GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                           (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
-                            vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
-                   vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm),
-                   // Value to indicate no rounding mode change in
-                   // RISCVInsertReadWriteCSR
-                   FRM_DYN,
-                   GPR:$vl, vti.Log2SEW, TU_MU)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                           (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
-                            vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, undef, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
-                   vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm),
-                   // Value to indicate no rounding mode change in
-                   // RISCVInsertReadWriteCSR
-                   FRM_DYN,
-                   GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm),
-                           (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
-                            vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
-                            vti.RegClass:$rd, undef, VLOpFrag),
-              (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
-                   vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm),
-                   // Value to indicate no rounding mode change in
-                   // RISCVInsertReadWriteCSR
-                   FRM_DYN,
-                   GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
-    }
-  }
-}
-
-multiclass VPatWidenFPMulAccVL_VV_VF<SDNode vop, string instruction_name> {
-  foreach vtiToWti = AllWidenableFloatVectors in {
-    defvar vti = vtiToWti.Vti;
-    defvar wti = vtiToWti.Wti;
-    let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                                 GetVTypePredicates<wti>.Predicates) in {
-      def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
-                     (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
-                     VLOpFrag),
-                (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX #"_MASK")
-                   wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
-      def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)),
-                     (vti.Vector vti.RegClass:$rs2),
-                     (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm),
-                     VLOpFrag),
-                (!cast<Instruction>(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK")
-                   wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
-                   (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>;
-    }
-  }
-}
-
 multiclass VPatWidenFPMulAccVL_VV_VF_RM<SDNode vop, string instruction_name,
                                         list<VTypeInfoToWide> vtiToWtis =
                                         AllWidenableFloatVectors> {
@@ -2331,8 +2182,6 @@ defm : VPatBinaryWVL_VV_VX<riscv_vwmulsu_vl, "PseudoVWMULSU">;
 // 11.13 Vector Single-Width Integer Multiply-Add Instructions
 defm : VPatMultiplyAddVL_VV_VX<riscv_add_vl, "PseudoVMADD">;
 defm : VPatMultiplyAddVL_VV_VX<riscv_sub_vl, "PseudoVNMSUB">;
-defm : VPatMultiplyAccVL_VV_VX<riscv_add_vl_oneuse, "PseudoVMACC">;
-defm : VPatMultiplyAccVL_VV_VX<riscv_sub_vl_oneuse, "PseudoVNMSAC">;
 
 // 11.14. Vector Widening Integer Multiply-Add Instructions
 defm : VPatWidenMultiplyAddVL_VV_VX<riscv_vwmacc_vl, "PseudoVWMACC">;
@@ -2470,10 +2319,6 @@ defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmadd_vl,  "PseudoVFMADD">;
 defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfmsub_vl,  "PseudoVFMSUB">;
 defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmadd_vl, "PseudoVFNMADD">;
 defm : VPatFPMulAddVL_VV_VF_RM<any_riscv_vfnmsub_vl, "PseudoVFNMSUB">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmadd_vl_oneuse,  "PseudoVFMACC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfmsub_vl_oneuse,  "PseudoVFMSAC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmadd_vl_oneuse, "PseudoVFNMACC">;
-defm : VPatFPMulAccVL_VV_VF_RM<riscv_vfnmsub_vl_oneuse, "PseudoVFNMSAC">;
 
 // 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
 defm : VPatWidenFPMulAccVL_VV_VF_RM<riscv_vfwmadd_vl, "PseudoVFWMACC">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
index 889ea9802257..d615094329b2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td
@@ -125,10 +125,25 @@ class Mips_prefetch_ri<dag outs, dag ins, string opcodestr, string argstr>
   let Inst{6-0} = OPC_CUSTOM_0.Value;
 }
 
+// MIPS custom barrier instruction format: no operands; rd, rs1, shamt fixed.
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
+class MIPSExtInst_ri<bits<6> shimm5, string opcodestr>
+    : RVInstIShift<0b00000, 0b001, OPC_OP_IMM, (outs), (ins), opcodestr, ""> {
+  let shamt = shimm5; // NOTE: shimm5 is declared bits<6> despite its name.
+  let rd = 0; // Fixed encoding field; there is no $rd operand.
+  let rs1 = 0; // Fixed encoding field; there is no $rs1 operand.
+}
+
 //===----------------------------------------------------------------------===//
 // MIPS extensions
 //===----------------------------------------------------------------------===//
-let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in {
+let Predicates = [HasVendorXMIPSEXECTL], DecoderNamespace = "XMIPS" in {
+  def MIPS_EHB : MIPSExtInst_ri<0b000011, "mips.ehb">; // shamt = 3
+  def MIPS_IHB : MIPSExtInst_ri<0b000001, "mips.ihb">; // shamt = 1
+  def MIPS_PAUSE : MIPSExtInst_ri<0b000101, "mips.pause">; // shamt = 5
+} // Predicates = [HasVendorXMIPSEXECTL], DecoderNamespace = "XMIPS"
+
+let Predicates = [HasVendorXMIPSCBOP], DecoderNamespace = "XMIPS" in {
   def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint),
                                     "mips.pref", "$hint, ${imm9}(${rs1})">,
                    Sched<[]>;
@@ -146,7 +161,7 @@ let Predicates = [HasVendorXMIPSCBOP] in {
 }
 
 let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0,
-                 DecoderNamespace = "Xmipscmov" in {
+    DecoderNamespace = "XMIPS" in {
 def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd),
                           (ins GPR:$rs1, GPR:$rs2, GPR:$rs3),
                           "mips.ccmov", "$rd, $rs2, $rs1, $rs3">,
@@ -166,7 +181,7 @@ def : Pat<(select (XLenVT GPR:$rs2), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)),
 }
 
 let Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0,
-                 DecoderNamespace = "Xmipslsp" in {
+    DecoderNamespace = "XMIPS" in {
 let mayLoad = 1, mayStore = 0 in {
 def MIPS_LWP : LWPFormat<(outs GPR:$rd1, GPR:$rd2), (ins GPR:$rs1, uimm7_lsb00:$imm7),
                          "mips.lwp", "$rd1, $rd2, ${imm7}(${rs1})">,
@@ -184,4 +199,4 @@ def MIPS_SDP : SDPFormat<(outs), (ins GPR:$rs2, GPR:$rs3, GPR:$rs1, uimm7_lsb000
                          "mips.sdp", "$rs2, $rs3, ${imm7}(${rs1})">,
                Sched<[WriteSTD, ReadStoreData, ReadStoreData, ReadMemBase]>;
 } // mayLoad = 0, mayStore = 1
-} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "Xmipslsp"
+} // Predicates = [HasVendorXMIPSLSP], hasSideEffects = 0, DecoderNamespace = "XMIPS"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 2c64b0c220fb..69796a68ecd6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -22,6 +22,15 @@ def SDT_SetMultiple : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
 def qc_setwmi : RVSDNode<"QC_SETWMI", SDT_SetMultiple,
                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def qc_insb : RVSDNode<"QC_INSB", SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+                                                       SDTCisSameAs<0, 2>,
+                                                       SDTCisVT<0, i32>,
+                                                       SDTCisInt<3>,
+                                                       SDTCisInt<4>]>,
+                       []>;
+
+def qc_e_li : RVSDNode<"QC_E_LI", SDTIntUnaryOp>;
+
 def uimm5nonzero : RISCVOp<XLenVT>,
                    ImmLeaf<XLenVT, [{return (Imm != 0) && isUInt<5>(Imm);}]> {
   let ParserMatchClass = UImmAsmOperand<5, "NonZero">;
@@ -1508,6 +1517,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
 def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 
 def : Pat<(i32 (bitreverse GPRNoX0:$rs1)), (QC_BREV32 GPRNoX0:$rs1)>;
+
+def : Pat<(qc_insb GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt),
+          (QC_INSBI GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt)>;
+def : Pat<(qc_insb GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt),
+          (QC_INSB GPRNoX0:$rd, GPR:$rs1, uimm5_plus1:$width, uimm5:$shamt)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
 // If Zbb is enabled sext.b/h is preferred since they are compressible
@@ -1605,6 +1619,13 @@ def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uim
           (QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>;
 } // Predicates = [HasVendorXqcilsm, IsRV32]
 
+let Predicates = [HasVendorXqcili, IsRV32] in {
+def: Pat<(qc_e_li tglobaladdr:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tblockaddress:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tjumptable:$A), (QC_E_LI bare_simm32:$A)>;
+def: Pat<(qc_e_li tconstpool:$A), (QC_E_LI bare_simm32:$A)>;
+} // Predicates = [HasVendorXqcili, IsRV32]
+
 //===----------------------------------------------------------------------===/i
 // Compress Instruction tablegen backend.
 //===----------------------------------------------------------------------===//
@@ -1738,10 +1759,19 @@ def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm),
                   (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>;
 } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32]
 
-let Predicates = [HasVendorXqciac, IsRV32] in {
+let isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32] in {
 def : CompressPat<(QC_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5),
                   (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5)>;
-}
+} // isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32]
+
+let isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32] in {
+def : CompressPat<(SH1ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+                  (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 2)>;
+def : CompressPat<(SH2ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+                  (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 4)>;
+def : CompressPat<(SH3ADD GPRC:$rd, GPRC:$rs1, GPRC:$rd),
+                  (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, 8)>;
+} // isCompressOnly = true, Predicates = [HasVendorXqciac, HasStdExtZba, IsRV32]
 
 let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] in {
 def : CompressPat<(QC_E_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
index a43cbadf6f30..bb1862cc88d6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXwch.td
@@ -106,6 +106,7 @@ def QK_C_LBUSP : QKStackInst<0b00, (outs GPRC:$rd_rs2),
                              (ins SPMem:$rs1, uimm4:$imm),
                              "qk.c.lbusp", "$rd_rs2, ${imm}(${rs1})">,
                  Sched<[WriteLDB, ReadMemBase]> {
+  bits<0> rs1;
   bits<4> imm;
   let Inst{10-7} = imm;
 }
@@ -115,6 +116,7 @@ def QK_C_SBSP : QKStackInst<0b10, (outs),
                                  uimm4:$imm),
                             "qk.c.sbsp", "$rd_rs2, ${imm}(${rs1})">,
                 Sched<[WriteSTB, ReadStoreData, ReadMemBase]> {
+  bits<0> rs1;
   bits<4> imm;
   let Inst{10-7} = imm;
 }
@@ -124,6 +126,7 @@ def QK_C_LHUSP : QKStackInst<0b01, (outs GPRC:$rd_rs2),
                              (ins SPMem:$rs1, uimm5_lsb0:$imm),
                              "qk.c.lhusp", "$rd_rs2, ${imm}(${rs1})">,
                  Sched<[WriteLDH, ReadMemBase]> {
+  bits<0> rs1;
   bits<5> imm;
   let Inst{10-8} = imm{3-1};
   let Inst{7} = imm{4};
@@ -133,6 +136,7 @@ def QK_C_SHSP : QKStackInst<0b11, (outs),
                             (ins GPRC:$rd_rs2, SPMem:$rs1, uimm5_lsb0:$imm),
                             "qk.c.shsp", "$rd_rs2, ${imm}(${rs1})">,
                 Sched<[WriteSTH, ReadStoreData, ReadMemBase]> {
+  bits<0> rs1;
   bits<5> imm;
   let Inst{10-8} = imm{3-1};
   let Inst{7} = imm{4};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 2abd3e613a03..a2b4302e19ed 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -459,15 +459,15 @@ let Predicates = [HasStdExtZba, IsRV64] in {
 def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>;
 } // Predicates = [HasStdExtZba, IsRV64]
 
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrZbkb] in {
 def : InstAlias<"ror $rd, $rs1, $shamt",
-                (RORI  GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
-} // Predicates = [HasStdExtZbb]
+                (RORI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbkb]
 
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
 def : InstAlias<"rorw $rd, $rs1, $shamt",
-                (RORIW  GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+                (RORIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
+} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
 
 let Predicates = [HasStdExtZbs] in {
 def : InstAlias<"bset $rd, $rs1, $shamt",
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
index 32e7f962aa2a..76dc027ffd1d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
@@ -22,5 +22,5 @@ class CMOPInst<bits<3> imm3, string opcodestr>
 
 foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in {
   let Predicates = [HasStdExtZcmop] in
-  def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
+  def C_MOP_ # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
index 49a57f86cccd..50ebaa995197 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
@@ -62,6 +62,21 @@ defm SSAMOSWAP_W  : AMO_rr_aq_rl<0b01001, 0b010, "ssamoswap.w">;
 let Predicates = [HasStdExtZicfiss, IsRV64] in
 defm SSAMOSWAP_D  : AMO_rr_aq_rl<0b01001, 0b011, "ssamoswap.d">;
 
+let Predicates = [HasStdExtZimop] in {
+let hasSideEffects = 1, mayLoad = 0, mayStore = 1 in
+def PseudoMOP_SSPUSH : Pseudo<(outs), (ins GPRX1X5:$rs2), []>,
+    PseudoInstExpansion<(MOP_RR_7 X0, X0, GPR:$rs2)>;
+let hasSideEffects = 1, mayLoad = 1, mayStore = 0 in
+def PseudoMOP_SSPOPCHK : Pseudo<(outs), (ins GPRX1X5:$rs1), []>,
+    PseudoInstExpansion<(MOP_R_28 X0, GPR:$rs1)>;
+} // Predicates = [HasStdExtZimop]
+
+let Predicates = [HasStdExtZcmop] in {
+let Uses = [X1], hasSideEffects = 1, mayLoad = 0, mayStore = 1 in
+def PseudoMOP_C_SSPUSH : Pseudo<(outs), (ins), []>,
+    PseudoInstExpansion<(C_MOP_1)>;
+} // Predicates = [HasStdExtZcmop]
+
 //===----------------------------------------------------------------------===/
 // Compress Instruction tablegen backend.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
index 960f5669b488..0d08176f9799 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
@@ -33,13 +33,13 @@ class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcod
 }
 
 // May-Be-Operations
-def riscv_mopr  : RVSDNode<"MOPR",
-                           SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
-                                                SDTCisSameAs<0, 2>]>>;
-def riscv_moprr : RVSDNode<"MOPRR",
-                           SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
-                                                SDTCisSameAs<0, 2>,
-                                                SDTCisSameAs<0, 3>]>>;
+def riscv_mop_r  : RVSDNode<"MOP_R",
+                            SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                                 SDTCisSameAs<0, 2>]>>;
+def riscv_mop_rr : RVSDNode<"MOP_RR",
+                            SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                                 SDTCisSameAs<0, 2>,
+                                                 SDTCisSameAs<0, 3>]>>;
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
@@ -50,31 +50,32 @@ class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3,
              RISCVOpcode opcode, string opcodestr>
-    : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+    : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd),
+                   (ins GPR:$rs1, GPR:$rs2),
                    opcodestr, "$rd, $rs1, $rs2">;
 
 foreach i = 0...31 in {
   let Predicates = [HasStdExtZimop] in
-  def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
-               Sched<[]>;
+  def MOP_R_#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
+                 Sched<[]>;
 }
 
 foreach i = 0...7 in {
   let Predicates = [HasStdExtZimop] in
-  def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
+  def MOP_RR_#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
                 Sched<[]>;
 }
 
 let Predicates = [HasStdExtZimop] in {
 // Zimop instructions
 foreach i = 0...31 in {
-  def : Pat<(XLenVT (riscv_mopr GPR:$rs1, (XLenVT i))),
-            (!cast<Instruction>("MOPR"#i) GPR:$rs1)>;
+  def : Pat<(XLenVT (riscv_mop_r GPR:$rs1, (XLenVT i))),
+            (!cast<Instruction>("MOP_R_"#i) GPR:$rs1)>;
 }
 
 foreach i = 0...7 in {
-  def : Pat<(XLenVT (riscv_moprr GPR:$rs1, GPR:$rs2, (XLenVT i))),
-            (!cast<Instruction>("MOPRR"#i) GPR:$rs1, GPR:$rs2)>;
+  def : Pat<(XLenVT (riscv_mop_rr GPR:$rs1, GPR:$rs2, (XLenVT i))),
+            (!cast<Instruction>("MOP_RR_"#i) GPR:$rs1, GPR:$rs2)>;
 }
 
 } // Predicates = [HasStdExtZimop]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
index 27959eaccd90..00c4e83e18a0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvqdotq.td
@@ -17,16 +17,39 @@
 // Instructions
 //===----------------------------------------------------------------------===//
 
+class VQDOTVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+    : RVInstVV<funct6, opv, (outs VR:$vd_wb),
+                (ins VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm),
+                opcodestr, "$vd, $vs2, $vs1$vm"> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let Constraints = "$vd = $vd_wb";
+}
+
+class VQDOTVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+    : RVInstVX<funct6, opv, (outs VR:$vd_wb),
+                (ins VR:$vd, VR:$vs2, GPR:$rs1, VMaskOp:$vm),
+                opcodestr, "$vd, $vs2, $rs1$vm"> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let Constraints = "$vd = $vd_wb";
+}
+
 let Predicates = [HasStdExtZvqdotq] in {
-  def VQDOT_VV   : VALUVV<0b101100, OPMVV, "vqdot.vv">;
-  def VQDOT_VX   : VALUVX<0b101100, OPMVX, "vqdot.vx">;
-  def VQDOTU_VV  : VALUVV<0b101000, OPMVV, "vqdotu.vv">;
-  def VQDOTU_VX  : VALUVX<0b101000, OPMVX, "vqdotu.vx">;
-  def VQDOTSU_VV : VALUVV<0b101010, OPMVV, "vqdotsu.vv">;
-  def VQDOTSU_VX : VALUVX<0b101010, OPMVX, "vqdotsu.vx">;
-  def VQDOTUS_VX : VALUVX<0b101110, OPMVX, "vqdotus.vx">;
+  def VQDOT_VV   : VQDOTVV<0b101100, OPMVV, "vqdot.vv">;
+  def VQDOT_VX   : VQDOTVX<0b101100, OPMVX, "vqdot.vx">;
+  def VQDOTU_VV  : VQDOTVV<0b101000, OPMVV, "vqdotu.vv">;
+  def VQDOTU_VX  : VQDOTVX<0b101000, OPMVX, "vqdotu.vx">;
+  def VQDOTSU_VV : VQDOTVV<0b101010, OPMVV, "vqdotsu.vv">;
+  def VQDOTSU_VX : VQDOTVX<0b101010, OPMVX, "vqdotsu.vx">;
+  def VQDOTUS_VX : VQDOTVX<0b101110, OPMVX, "vqdotus.vx">;
 } // Predicates = [HasStdExtZvqdotq]
 
+//===----------------------------------------------------------------------===//
+// Helpers to define the VL patterns.
+//===----------------------------------------------------------------------===//
 
 let HasPassthruOp = true, HasMaskOp = true in {
   def riscv_vqdot_vl : RVSDNode<"VQDOT_VL", SDT_RISCVIntBinOp_VL>;
@@ -34,6 +57,10 @@ let HasPassthruOp = true, HasMaskOp = true in {
   def riscv_vqdotsu_vl : RVSDNode<"VQDOTSU_VL", SDT_RISCVIntBinOp_VL>;
 } // let HasPassthruOp = true, HasMaskOp = true
 
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions for CodeGen
+//===----------------------------------------------------------------------===//
+
 multiclass VPseudoVQDOT_VV_VX {
   foreach m = MxSet<32>.m in {
     defm "" : VPseudoBinaryV_VV<m>,
@@ -52,10 +79,69 @@ let Predicates = [HasStdExtZvqdotq], mayLoad = 0, mayStore = 0,
   defm PseudoVQDOT : VPseudoVQDOT_VV_VX;
   defm PseudoVQDOTU : VPseudoVQDOT_VV_VX;
   defm PseudoVQDOTSU : VPseudoVQDOT_VV_VX;
+  // VQDOTUS does not have a VV variant
+  foreach m = MxListVF4 in {
+    defm "PseudoVQDOTUS_VX" : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, GPR, m>;
+  }
 }
 
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+
 defvar AllE32Vectors = [VI32MF2, VI32M1, VI32M2, VI32M4, VI32M8];
 defm : VPatBinaryVL_VV_VX<riscv_vqdot_vl, "PseudoVQDOT", AllE32Vectors>;
 defm : VPatBinaryVL_VV_VX<riscv_vqdotu_vl, "PseudoVQDOTU", AllE32Vectors>;
 defm : VPatBinaryVL_VV_VX<riscv_vqdotsu_vl, "PseudoVQDOTSU", AllE32Vectors>;
 
+// These VPat definitions exist because the vqdot instructions have a different
+// operand order from other ternary instructions (i.e. vop.vx vd, vs2, rs1).
+multiclass VPatTernaryV_VX_AABX<string intrinsic, string instruction,
+                                list<VTypeInfoToWide> info_pairs> {
+  foreach pair = info_pairs in {
+    defvar VdInfo = pair.Wti;
+    defvar Vs2Info = pair.Vti;
+    let Predicates = GetVTypePredicates<VdInfo>.Predicates in
+    defm : VPatTernaryWithPolicy<intrinsic, instruction,
+                                 "V"#VdInfo.ScalarSuffix,
+                                 VdInfo.Vector, Vs2Info.Vector, Vs2Info.Scalar,
+                                 VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul,
+                                 VdInfo.RegClass, Vs2Info.RegClass,
+                                 Vs2Info.ScalarRegClass>;
+  }
+}
+
+multiclass VPatTernaryV_VV_AABX<string intrinsic, string instruction,
+                                list<VTypeInfoToWide> info_pairs> {
+  foreach pair = info_pairs in {
+    defvar VdInfo = pair.Wti;
+    defvar Vs2Info = pair.Vti;
+    let Predicates = GetVTypePredicates<VdInfo>.Predicates in
+    defm : VPatTernaryWithPolicy<intrinsic, instruction,
+                                 "VV",
+                                 VdInfo.Vector, Vs2Info.Vector, Vs2Info.Vector,
+                                 VdInfo.Mask, VdInfo.Log2SEW, VdInfo.LMul,
+                                 VdInfo.RegClass, Vs2Info.RegClass,
+                                 Vs2Info.RegClass>;
+  }
+}
+
+multiclass VPatTernaryV_VV_VX_AABX<string intrinsic, string instruction,
+                                   list<VTypeInfoToWide> info_pairs>
+    : VPatTernaryV_VV_AABX<intrinsic, instruction, info_pairs>,
+      VPatTernaryV_VX_AABX<intrinsic, instruction, info_pairs>;
+
+defset list<VTypeInfoToWide> VQDOTInfoPairs = {
+  def : VTypeInfoToWide<VI8MF2, VI32MF2>;
+  def : VTypeInfoToWide<VI8M1, VI32M1>;
+  def : VTypeInfoToWide<VI8M2, VI32M2>;
+  def : VTypeInfoToWide<VI8M4, VI32M4>;
+  def : VTypeInfoToWide<VI8M8, VI32M8>;
+}
+
+let Predicates = [HasStdExtZvqdotq] in {
+  defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdot", "PseudoVQDOT", VQDOTInfoPairs>;
+  defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotu", "PseudoVQDOTU", VQDOTInfoPairs>;
+  defm : VPatTernaryV_VV_VX_AABX<"int_riscv_vqdotsu", "PseudoVQDOTSU", VQDOTInfoPairs>;
+  defm : VPatTernaryV_VX_AABX<"int_riscv_vqdotus", "PseudoVQDOTUS", VQDOTInfoPairs>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 4abe62f4e874..06309262f1b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -148,6 +148,14 @@ def isNonZeroLoadImmediate
                      CheckNot<CheckImmOperand<2, 0>>
                    ]>>>;
 
+def isLPAD
+    : TIIPredicate<"isLPAD",
+                   MCReturnStatement<CheckAll<[
+                     CheckOpcode<[AUIPC]>,
+                     CheckIsRegOperand<0>,
+                     CheckRegOperand<0, X0>,
+                   ]>>>;
+
 def ignoresVXRM
     : TIIPredicate<"ignoresVXRM",
                    MCOpcodeSwitchStatement<
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index c7b96f5c3d0c..5e1063155ba0 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -81,6 +81,12 @@ static const Intrinsic::ID FixedVssegIntrIds[] = {
     Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
     Intrinsic::riscv_seg8_store_mask};
 
+static const Intrinsic::ID FixedVsssegIntrIds[] = {
+    Intrinsic::riscv_sseg2_store_mask, Intrinsic::riscv_sseg3_store_mask,
+    Intrinsic::riscv_sseg4_store_mask, Intrinsic::riscv_sseg5_store_mask,
+    Intrinsic::riscv_sseg6_store_mask, Intrinsic::riscv_sseg7_store_mask,
+    Intrinsic::riscv_sseg8_store_mask};
+
 static const Intrinsic::ID ScalableVssegIntrIds[] = {
     Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
     Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
@@ -275,7 +281,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
                                                 Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
-                                                unsigned Factor) const {
+                                                unsigned Factor,
+                                                const APInt &GapMask) const {
+  assert(GapMask.getBitWidth() == Factor);
+
+  // We only support cases where the skipped fields are the trailing ones.
+  // TODO: Lower to strided store if there is only a single active field.
+  unsigned MaskFactor = GapMask.popcount();
+  if (MaskFactor < 2 || !GapMask.isMask())
+    return false;
+
   IRBuilder<> Builder(Store);
   const DataLayout &DL = Store->getDataLayout();
   auto Mask = SVI->getShuffleMask();
@@ -287,21 +302,31 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
 
   Value *Ptr, *VL;
   Align Alignment;
-  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
+  if (!getMemOperands(MaskFactor, VTy, XLenTy, Store, Ptr, LaneMask, VL,
+                      Alignment))
     return false;
 
   Type *PtrTy = Ptr->getType();
   unsigned AS = PtrTy->getPointerAddressSpace();
-  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+  if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
     return false;
 
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+  Function *SegStoreFunc;
+  if (MaskFactor < Factor)
+    // Strided segmented store.
+    SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+        Store->getModule(), FixedVsssegIntrIds[MaskFactor - 2],
+        {VTy, PtrTy, XLenTy, XLenTy});
+  else
+    // Normal segmented store.
+    SegStoreFunc = Intrinsic::getOrInsertDeclaration(
+        Store->getModule(), FixedVssegIntrIds[Factor - 2],
+        {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *, 10> Ops;
   SmallVector<int, 16> NewShuffleMask;
 
-  for (unsigned i = 0; i < Factor; i++) {
+  for (unsigned i = 0; i < MaskFactor; i++) {
     // Collect shuffle mask for this lane.
     for (unsigned j = 0; j < VTy->getNumElements(); j++)
       NewShuffleMask.push_back(Mask[i + Factor * j]);
@@ -312,8 +337,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
 
     NewShuffleMask.clear();
   }
-  Ops.append({Ptr, LaneMask, VL});
-  Builder.CreateCall(VssegNFunc, Ops);
+  Ops.push_back(Ptr);
+  if (MaskFactor < Factor) {
+    // Insert the stride argument.
+    unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+    Ops.push_back(ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes));
+  }
+  Ops.append({LaneMask, VL});
+  Builder.CreateCall(SegStoreFunc, Ops);
 
   return true;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 3b19c3456ad6..d08115b72977 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -356,6 +356,14 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,
           return false;
         Worklist.emplace_back(UserMI, Bits);
         break;
+      case RISCV::TH_EXT:
+      case RISCV::TH_EXTU:
+        unsigned Msb = UserMI->getOperand(2).getImm();
+        unsigned Lsb = UserMI->getOperand(3).getImm();
+        // Behavior of Msb < Lsb is not well documented.
+        if (Msb >= Lsb && Bits > Msb)
+          break;
+        return false;
       }
     }
   }
@@ -409,6 +417,16 @@ static bool isSignExtendingOpW(const MachineInstr &MI, unsigned OpNo) {
     assert(Log2SEW >= 3 && Log2SEW <= 6 && "Unexpected Log2SEW");
     return Log2SEW <= 5;
   }
+  case RISCV::TH_EXT: {
+    unsigned Msb = MI.getOperand(2).getImm();
+    unsigned Lsb = MI.getOperand(3).getImm();
+    return Msb >= Lsb && (Msb - Lsb + 1) <= 32;
+  }
+  case RISCV::TH_EXTU: {
+    unsigned Msb = MI.getOperand(2).getImm();
+    unsigned Lsb = MI.getOperand(3).getImm();
+    return Msb >= Lsb && (Msb - Lsb + 1) < 32;
+  }
   }
 
   return false;
@@ -519,9 +537,11 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
     case RISCV::ANDI:
     case RISCV::ORI:
     case RISCV::XORI:
+    case RISCV::SRAI:
       // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R.
       // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1
       // Logical operations use a sign extended 12-bit immediate.
+      // Arithmetic shift right can only increase the number of sign bits.
       if (!AddRegToWorkList(MI->getOperand(1).getReg()))
         return false;
 
@@ -556,6 +576,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
     case RISCV::PseudoCCAND:
     case RISCV::PseudoCCOR:
     case RISCV::PseudoCCXOR:
+    case RISCV::PseudoCCANDN:
+    case RISCV::PseudoCCORN:
+    case RISCV::PseudoCCXNOR:
     case RISCV::PHI: {
       // If all incoming values are sign-extended, the output of AND, OR, XOR,
       // MIN, MAX, or PHI is also sign-extended.
@@ -578,6 +601,9 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST,
       case RISCV::PseudoCCAND:
       case RISCV::PseudoCCOR:
       case RISCV::PseudoCCXOR:
+      case RISCV::PseudoCCANDN:
+      case RISCV::PseudoCCORN:
+      case RISCV::PseudoCCXNOR:
         B = 4;
         E = 7;
         break;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index f89d94f41b69..36d63ed23b92 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -121,7 +121,8 @@ def MIPS_P8700 : RISCVProcessorModel<"mips-p8700",
                                       FeatureStdExtZicsr,
                                       FeatureVendorXMIPSCMov,
                                       FeatureVendorXMIPSLSP,
-                                      FeatureVendorXMIPSCBOP],
+                                      FeatureVendorXMIPSCBOP,
+                                      FeatureVendorXMIPSEXECTL],
                                      [TuneMIPSP8700]>;
 
 def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32",
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index f3966a55ce7d..40b641680b2c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -966,7 +966,9 @@ bool RISCVRegisterInfo::getRegAllocationHints(
       }
     }
 
-    // Add a hint if it would allow auipc/lui+addi(w) fusion.
+    // Add a hint if it would allow auipc/lui+addi(w) fusion.  We do this even
+    // without the fusions explicitly enabled as the impact is rarely negative
+    // and some cores do implement this fusion.
     if ((MI.getOpcode() == RISCV::ADDIW || MI.getOpcode() == RISCV::ADDI) &&
         MI.getOperand(1).isReg()) {
       const MachineBasicBlock &MBB = *MI.getParent();
@@ -974,9 +976,7 @@ bool RISCVRegisterInfo::getRegAllocationHints(
       // Is the previous instruction a LUI or AUIPC that can be fused?
       if (I != MBB.begin()) {
         I = skipDebugInstructionsBackward(std::prev(I), MBB.begin());
-        if (((I->getOpcode() == RISCV::LUI && Subtarget.hasLUIADDIFusion()) ||
-             (I->getOpcode() == RISCV::AUIPC &&
-              Subtarget.hasAUIPCADDIFusion())) &&
+        if ((I->getOpcode() == RISCV::LUI || I->getOpcode() == RISCV::AUIPC) &&
             I->getOperand(0).getReg() == MI.getOperand(1).getReg()) {
           if (OpIdx == 0)
             tryAddHint(MO, MI.getOperand(1), /*NeedGPRC=*/false);
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index fd57e02c25d0..50e76df56e57 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -186,6 +186,12 @@ public:
     return HasStdExtZfhmin || HasStdExtZfbfmin;
   }
 
+  bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; }
+
+  bool hasCZEROLike() const {
+    return HasStdExtZicond || HasVendorXVentanaCondOps;
+  }
+
   bool hasConditionalMoveFusion() const {
     // Do we support fusing a branch+mv or branch+c.mv as a conditional move.
     return (hasConditionalCompressedMoveFusion() && hasStdExtZca()) ||
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index d70b1d0dc8d5..460bb33f2553 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -652,7 +652,8 @@ void RISCVPassConfig::addPostRegAlloc() {
 void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
                                                  OptimizationLevel Level) {
-    LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
+    if (Level != OptimizationLevel::O0)
+      LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
   });
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index c707fb110b10..1ca513214f67 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1566,6 +1566,18 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
+InstructionCost
+RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                        const SCEV *Ptr,
+                                        TTI::TargetCostKind CostKind) const {
+  // Address computations for vector indexed load/store likely require an offset
+  // and/or scaling.
+  if (ST->hasVInstructions() && PtrTy->isVectorTy())
+    return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
+
+  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
+}
+
 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                Type *Src,
                                                TTI::CastContextHint CCH,
@@ -2731,6 +2743,10 @@ unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const {
   return RVVMinTripCount;
 }
 
+bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
+  return ST->enableUnalignedVectorMem();
+}
+
 TTI::AddressingModeKind
 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
                                          ScalarEvolution *SE) const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3236b2a35c85..6bd7d51daff6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -132,7 +132,7 @@ public:
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
 
-  bool preferAlternateOpcodeVectorization() const override { return false; }
+  bool preferAlternateOpcodeVectorization() const override;
 
   bool preferEpilogueVectorization() const override {
     // Epilogue vectorization is usually unprofitable - tail folding or
@@ -177,6 +177,10 @@ public:
   getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                         TTI::TargetCostKind CostKind) const override;
 
+  InstructionCost
+  getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr,
+                            TTI::TargetCostKind CostKind) const override;
+
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 53557049ea33..29526cf5a527 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -178,8 +178,20 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
   return Log2EEW;
 }
 
-static std::optional<unsigned>
-getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+#define VSEG_CASES(Prefix, EEW)                                                \
+  RISCV::Prefix##SEG2E##EEW##_V:                                               \
+  case RISCV::Prefix##SEG3E##EEW##_V:                                          \
+  case RISCV::Prefix##SEG4E##EEW##_V:                                          \
+  case RISCV::Prefix##SEG5E##EEW##_V:                                          \
+  case RISCV::Prefix##SEG6E##EEW##_V:                                          \
+  case RISCV::Prefix##SEG7E##EEW##_V:                                          \
+  case RISCV::Prefix##SEG8E##EEW##_V
+#define VSSEG_CASES(EEW)    VSEG_CASES(VS, EEW)
+#define VSSSEG_CASES(EEW)   VSEG_CASES(VSS, EEW)
+#define VSUXSEG_CASES(EEW)  VSEG_CASES(VSUX, I##EEW)
+#define VSOXSEG_CASES(EEW)  VSEG_CASES(VSOX, I##EEW)
+
+static std::optional<unsigned> getOperandLog2EEW(const MachineOperand &MO) {
   const MachineInstr &MI = *MO.getParent();
   const MCInstrDesc &Desc = MI.getDesc();
   const RISCVVPseudosTable::PseudoInfo *RVV =
@@ -225,21 +237,29 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VSE8_V:
   case RISCV::VLSE8_V:
   case RISCV::VSSE8_V:
+  case VSSEG_CASES(8):
+  case VSSSEG_CASES(8):
     return 3;
   case RISCV::VLE16_V:
   case RISCV::VSE16_V:
   case RISCV::VLSE16_V:
   case RISCV::VSSE16_V:
+  case VSSEG_CASES(16):
+  case VSSSEG_CASES(16):
     return 4;
   case RISCV::VLE32_V:
   case RISCV::VSE32_V:
   case RISCV::VLSE32_V:
   case RISCV::VSSE32_V:
+  case VSSEG_CASES(32):
+  case VSSSEG_CASES(32):
     return 5;
   case RISCV::VLE64_V:
   case RISCV::VSE64_V:
   case RISCV::VLSE64_V:
   case RISCV::VSSE64_V:
+  case VSSEG_CASES(64):
+  case VSSSEG_CASES(64):
     return 6;
 
   // Vector Indexed Instructions
@@ -248,7 +268,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VLUXEI8_V:
   case RISCV::VLOXEI8_V:
   case RISCV::VSUXEI8_V:
-  case RISCV::VSOXEI8_V: {
+  case RISCV::VSOXEI8_V:
+  case VSUXSEG_CASES(8):
+  case VSOXSEG_CASES(8): {
     if (MO.getOperandNo() == 0)
       return MILog2SEW;
     return 3;
@@ -256,7 +278,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VLUXEI16_V:
   case RISCV::VLOXEI16_V:
   case RISCV::VSUXEI16_V:
-  case RISCV::VSOXEI16_V: {
+  case RISCV::VSOXEI16_V:
+  case VSUXSEG_CASES(16):
+  case VSOXSEG_CASES(16): {
     if (MO.getOperandNo() == 0)
       return MILog2SEW;
     return 4;
@@ -264,7 +288,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VLUXEI32_V:
   case RISCV::VLOXEI32_V:
   case RISCV::VSUXEI32_V:
-  case RISCV::VSOXEI32_V: {
+  case RISCV::VSOXEI32_V:
+  case VSUXSEG_CASES(32):
+  case VSOXSEG_CASES(32): {
     if (MO.getOperandNo() == 0)
       return MILog2SEW;
     return 5;
@@ -272,7 +298,9 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VLUXEI64_V:
   case RISCV::VLOXEI64_V:
   case RISCV::VSUXEI64_V:
-  case RISCV::VSOXEI64_V: {
+  case RISCV::VSOXEI64_V:
+  case VSUXSEG_CASES(64):
+  case VSOXSEG_CASES(64): {
     if (MO.getOperandNo() == 0)
       return MILog2SEW;
     return 6;
@@ -422,9 +450,6 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VRGATHER_VI:
   case RISCV::VRGATHER_VV:
   case RISCV::VRGATHER_VX:
-  // Vector Compress Instruction
-  // EEW=SEW.
-  case RISCV::VCOMPRESS_VM:
   // Vector Element Index Instruction
   case RISCV::VID_V:
   // Vector Single-Width Floating-Point Add/Subtract Instructions
@@ -674,6 +699,12 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
     return MILog2SEW;
   }
 
+  // Vector Compress Instruction
+  // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
+  // before this switch.
+  case RISCV::VCOMPRESS_VM:
+    return MO.getOperandNo() == 3 ? 0 : MILog2SEW;
+
   // Vector Iota Instruction
   // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
   // before this switch.
@@ -778,14 +809,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   }
 }
 
-static std::optional<OperandInfo>
-getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+static std::optional<OperandInfo> getOperandInfo(const MachineOperand &MO) {
   const MachineInstr &MI = *MO.getParent();
   const RISCVVPseudosTable::PseudoInfo *RVV =
       RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
   assert(RVV && "Could not find MI in PseudoTable");
 
-  std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI);
+  std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO);
   if (!Log2EEW)
     return std::nullopt;
 
@@ -900,13 +930,6 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VSEXT_VF4:
   case RISCV::VZEXT_VF8:
   case RISCV::VSEXT_VF8:
-  // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  // FIXME: Add support
-  case RISCV::VMADC_VV:
-  case RISCV::VMADC_VI:
-  case RISCV::VMADC_VX:
-  case RISCV::VMSBC_VV:
-  case RISCV::VMSBC_VX:
   // Vector Narrowing Integer Right Shift Instructions
   case RISCV::VNSRL_WX:
   case RISCV::VNSRL_WI:
@@ -993,6 +1016,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VSBC_VXM:
   case RISCV::VMSBC_VVM:
   case RISCV::VMSBC_VXM:
+  case RISCV::VMADC_VV:
+  case RISCV::VMADC_VI:
+  case RISCV::VMADC_VX:
+  case RISCV::VMSBC_VV:
+  case RISCV::VMSBC_VX:
   // Vector Widening Integer Multiply-Add Instructions
   case RISCV::VWMACCU_VV:
   case RISCV::VWMACCU_VX:
@@ -1001,10 +1029,7 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VWMACCSU_VV:
   case RISCV::VWMACCSU_VX:
   case RISCV::VWMACCUS_VX:
-  // Vector Integer Merge Instructions
-  // FIXME: Add support
   // Vector Integer Move Instructions
-  // FIXME: Add support
   case RISCV::VMV_V_I:
   case RISCV::VMV_V_X:
   case RISCV::VMV_V_V:
@@ -1306,7 +1331,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   // TODO: Use a better approach than a white-list, such as adding
   // properties to instructions using something like TSFlags.
   if (!isSupportedInstr(MI)) {
-    LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction\n");
+    LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction: "
+                      << MI);
     return false;
   }
 
@@ -1328,14 +1354,14 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
   const MCInstrDesc &Desc = UserMI.getDesc();
 
   if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
-    LLVM_DEBUG(dbgs() << "    Abort due to lack of VL, assume that"
+    LLVM_DEBUG(dbgs() << "  Abort due to lack of VL, assume that"
                          " use VLMAX\n");
     return std::nullopt;
   }
 
   if (RISCVII::readsPastVL(
           TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
-    LLVM_DEBUG(dbgs() << "    Abort because used by unsafe instruction\n");
+    LLVM_DEBUG(dbgs() << "  Abort because used by unsafe instruction\n");
     return std::nullopt;
   }
 
@@ -1352,7 +1378,7 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
            RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc()));
     auto DemandedVL = DemandedVLs.lookup(&UserMI);
     if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) {
-      LLVM_DEBUG(dbgs() << "    Abort because user is passthru in "
+      LLVM_DEBUG(dbgs() << "  Abort because user is passthru in "
                            "instruction with demanded tail\n");
       return std::nullopt;
     }
@@ -1376,6 +1402,54 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
   return VLOp;
 }
 
+/// Return true if MI is an INSERT_SUBREG that assembles a vector tuple
+/// register (NF >= 2) for segmented store instructions. This is what
+/// RISCVISD::TUPLE_INSERT is currently lowered to.
+static bool isTupleInsertInstr(const MachineInstr &MI) {
+  if (!MI.isInsertSubreg())
+    return false;
+
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+  if (!RISCVRI::isVRegClass(DstRC->TSFlags))
+    return false;
+  unsigned NF = RISCVRI::getNF(DstRC->TSFlags);
+  if (NF < 2)
+    return false;
+
+  // A tuple insert's subreg index must cover exactly RVVBitsPerBlock * LMUL.
+  auto VLMul = RISCVRI::getLMul(DstRC->TSFlags);
+  unsigned SubRegIdx = MI.getOperand(3).getImm();
+  [[maybe_unused]] auto [LMul, IsFractional] = RISCVVType::decodeVLMUL(VLMul);
+  assert(!IsFractional && "unexpected LMUL for tuple register classes");
+  return TRI->getSubRegIdxSize(SubRegIdx) == RISCV::RVVBitsPerBlock * LMul;
+}
+
+static bool isSegmentedStoreInstr(const MachineInstr &MI) {
+  switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+  case VSSEG_CASES(8):
+  case VSSSEG_CASES(8):
+  case VSUXSEG_CASES(8):
+  case VSOXSEG_CASES(8):
+  case VSSEG_CASES(16):
+  case VSSSEG_CASES(16):
+  case VSUXSEG_CASES(16):
+  case VSOXSEG_CASES(16):
+  case VSSEG_CASES(32):
+  case VSSSEG_CASES(32):
+  case VSUXSEG_CASES(32):
+  case VSOXSEG_CASES(32):
+  case VSSEG_CASES(64):
+  case VSSSEG_CASES(64):
+  case VSUXSEG_CASES(64):
+  case VSOXSEG_CASES(64):
+    return true;
+  default:
+    return false;
+  }
+}
+
 std::optional<MachineOperand>
 RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
   std::optional<MachineOperand> CommonVL;
@@ -1396,6 +1470,23 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
       continue;
     }
 
+    if (isTupleInsertInstr(UserMI)) {
+      LLVM_DEBUG(dbgs().indent(4) << "Peeking through uses of INSERT_SUBREG\n");
+      for (MachineOperand &UseOp :
+           MRI->use_operands(UserMI.getOperand(0).getReg())) {
+        const MachineInstr &CandidateMI = *UseOp.getParent();
+        // We should not propagate the VL if the user is not a segmented store
+        // or another INSERT_SUBREG, since VL just works differently
+        // between segmented operations (per-field) vs. other RVV ops (on the
+        // whole register group).
+        if (!isTupleInsertInstr(CandidateMI) &&
+            !isSegmentedStoreInstr(CandidateMI))
+          return std::nullopt;
+        Worklist.insert(&UseOp);
+      }
+      continue;
+    }
+
     if (UserMI.isPHI()) {
       // Don't follow PHI cycles
       if (!PHISeen.insert(&UserMI).second)
@@ -1425,9 +1516,8 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
       return std::nullopt;
     }
 
-    std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI);
-    std::optional<OperandInfo> ProducerInfo =
-        getOperandInfo(MI.getOperand(0), MRI);
+    std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp);
+    std::optional<OperandInfo> ProducerInfo = getOperandInfo(MI.getOperand(0));
     if (!ConsumerInfo || !ProducerInfo) {
       LLVM_DEBUG(dbgs() << "    Abort due to unknown operand information.\n");
       LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
@@ -1449,7 +1539,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
 }
 
 bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
-  LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
+  LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI);
 
   unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc());
   MachineOperand &VLOp = MI.getOperand(VLOpNum);
@@ -1468,14 +1558,23 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
   assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) &&
          "Expected VL to be an Imm or virtual Reg");
 
+  // If the VL is defined by a vleff that doesn't dominate MI, try using the
+  // vleff's AVL. It will be greater than or equal to the output VL.
+  if (CommonVL->isReg()) {
+    const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg());
+    if (RISCVInstrInfo::isFaultOnlyFirstLoad(*VLMI) &&
+        !MDT->dominates(VLMI, &MI))
+      CommonVL = VLMI->getOperand(RISCVII::getVLOpNum(VLMI->getDesc()));
+  }
+
   if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) {
-    LLVM_DEBUG(dbgs() << "    Abort due to CommonVL not <= VLOp.\n");
+    LLVM_DEBUG(dbgs() << "  Abort due to CommonVL not <= VLOp.\n");
     return false;
   }
 
   if (CommonVL->isIdenticalTo(VLOp)) {
     LLVM_DEBUG(
-        dbgs() << "    Abort due to CommonVL == VLOp, no point in reducing.\n");
+        dbgs() << "  Abort due to CommonVL == VLOp, no point in reducing.\n");
     return false;
   }
 
@@ -1486,8 +1585,10 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
     return true;
   }
   const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg());
-  if (!MDT->dominates(VLMI, &MI))
+  if (!MDT->dominates(VLMI, &MI)) {
+    LLVM_DEBUG(dbgs() << "  Abort due to VL not dominating.\n");
     return false;
+  }
   LLVM_DEBUG(
       dbgs() << "  Reduce VL from " << VLOp << " to "
              << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo())
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 050de3d58a2f..62651185137c 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -745,12 +745,24 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
   if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg))
     return false;
 
+  std::optional<std::pair<unsigned, unsigned>> NeedsCommute;
+
   // If True has a passthru operand then it needs to be the same as vmerge's
   // False, since False will be used for the result's passthru operand.
   Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg();
   if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru &&
-      !isKnownSameDefs(TruePassthru, FalseReg))
-    return false;
+      !isKnownSameDefs(TruePassthru, FalseReg)) {
+    // If True's passthru != False, check if it uses False in another operand
+    // and try to commute it.
+    int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI);
+    if (OtherIdx == -1)
+      return false;
+    unsigned OpIdx1 = OtherIdx;
+    unsigned OpIdx2 = True.getNumExplicitDefs();
+    if (!TII->findCommutedOpIndices(True, OpIdx1, OpIdx2))
+      return false;
+    NeedsCommute = {OpIdx1, OpIdx2};
+  }
 
   // Make sure it doesn't raise any observable fp exceptions, since changing the
   // active elements will affect how fflags is set.
@@ -796,6 +808,14 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
   if (!ensureDominates(MaskOp, True))
     return false;
 
+  if (NeedsCommute) {
+    auto [OpIdx1, OpIdx2] = *NeedsCommute;
+    [[maybe_unused]] bool Commuted =
+        TII->commuteInstruction(True, /*NewMI=*/false, OpIdx1, OpIdx2);
+    assert(Commuted && "Failed to commute True?");
+    Info = RISCV::lookupMaskedIntrinsicByUnmasked(True.getOpcode());
+  }
+
   True.setDesc(TII->get(Info->MaskedPseudo));
 
   // Insert the mask operand.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index f658b67a4c2a..45e88fc94144 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "SPIRVInstrInfo.h"
 #include "SPIRV.h"
+#include "SPIRVSubtarget.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -22,7 +23,8 @@
 
 using namespace llvm;
 
-SPIRVInstrInfo::SPIRVInstrInfo() : SPIRVGenInstrInfo() {}
+SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI)
+    : SPIRVGenInstrInfo(STI) {}
 
 bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index d58dddcd8da2..72d2243fba62 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -20,12 +20,13 @@
 #include "SPIRVGenInstrInfo.inc"
 
 namespace llvm {
+class SPIRVSubtarget;
 
 class SPIRVInstrInfo : public SPIRVGenInstrInfo {
   const SPIRVRegisterInfo RI;
 
 public:
-  SPIRVInstrInfo();
+  explicit SPIRVInstrInfo(const SPIRVSubtarget &STI);
 
   const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
   bool isHeaderInstr(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index f0b938d681db..8d10cd0ffb3d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -637,8 +637,8 @@ let isReturn = 1, hasDelaySlot = 0, isBarrier = 0, isTerminator = 1, isNotDuplic
   def OpReturnValue: Op<254, (outs), (ins ID:$ret), "OpReturnValue $ret">;
   def OpUnreachable: SimpleOp<"OpUnreachable", 255>;
 }
-def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr, $sz">;
-def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr, $sz">;
+def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr $sz">;
+def OpLifetimeStop: Op<257, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStop $ptr $sz">;
 def OpDemoteToHelperInvocation: SimpleOp<"OpDemoteToHelperInvocation", 5380>;
 
 // 3.42.18 Atomic Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 98c7709acf93..3ad5528fab06 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -204,6 +204,9 @@ private:
   bool selectIntegerDotExpansion(Register ResVReg, const SPIRVType *ResType,
                                  MachineInstr &I) const;
 
+  bool selectOpIsInf(Register ResVReg, const SPIRVType *ResType,
+                     MachineInstr &I) const;
+
   template <bool Signed>
   bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I) const;
@@ -2042,6 +2045,17 @@ bool SPIRVInstructionSelector::selectIntegerDotExpansion(
   return Result;
 }
 
+bool SPIRVInstructionSelector::selectOpIsInf(Register ResVReg,
+                                             const SPIRVType *ResType,
+                                             MachineInstr &I) const {
+  MachineBasicBlock &BB = *I.getParent();
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIsInf))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addUse(I.getOperand(2).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
 template <bool Signed>
 bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
                                                    const SPIRVType *ResType,
@@ -3183,6 +3197,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectExtInst(ResVReg, ResType, I, GL::FaceForward);
   case Intrinsic::spv_frac:
     return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract);
+  case Intrinsic::spv_isinf:
+    return selectOpIsInf(ResVReg, ResType, I);
   case Intrinsic::spv_normalize:
     return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize);
   case Intrinsic::spv_refract:
@@ -4276,9 +4292,11 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
   uint32_t Binding = foldImm(HandleDef.getOperand(3), MRI);
   uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI);
   Register IndexReg = HandleDef.getOperand(5).getReg();
-  bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI);
+  // FIXME: The IsNonUniform flag needs to be set based on resource analysis.
+  // https://github.com/llvm/llvm-project/issues/155701
+  bool IsNonUniform = false;
   std::string Name =
-      getStringValueFromReg(HandleDef.getOperand(7).getReg(), *MRI);
+      getStringValueFromReg(HandleDef.getOperand(6).getReg(), *MRI);
 
   bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer;
   MachineIRBuilder MIRBuilder(HandleDef);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 8039cf0c432f..b7e371d19086 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -124,7 +124,7 @@ getSymbolicOperandRequirements(SPIRV::OperandCategory::OperandCategory Category,
       })) {
     return {true,
             {},
-            ReqExts,
+            std::move(ReqExts),
             VersionTuple(),
             VersionTuple()}; // TODO: add versions to extensions.
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 55c9c4c5380b..1811492bf217 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -43,7 +43,7 @@ using Edge = std::pair<BasicBlock *, BasicBlock *>;
 static void partialOrderVisit(BasicBlock &Start,
                               std::function<bool(BasicBlock *)> Op) {
   PartialOrderingVisitor V(*Start.getParent());
-  V.partialOrderVisit(Start, Op);
+  V.partialOrderVisit(Start, std::move(Op));
 }
 
 // Returns the exact convergence region in the tree defined by `Node` for which
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index 690493fb426b..5b746a1389af 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -53,9 +53,9 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const SPIRVTargetMachine &TM)
     : SPIRVGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS),
-      PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)), InstrInfo(),
-      FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
-      TargetTriple(TT) {
+      PointerSize(TM.getPointerSizeInBits(/* AS= */ 0)),
+      InstrInfo(initSubtargetDependencies(CPU, FS)), FrameLowering(*this),
+      TLInfo(TM, *this), TargetTriple(TT) {
   switch (TT.getSubArch()) {
   case Triple::SPIRVSubArch_v10:
     SPIRVVersion = VersionTuple(1, 0);
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index f1cd9b1ab07c..c3d60f3689e1 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -266,16 +266,47 @@ DecodeCoprocPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM5(MCInst &Inst, unsigned insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const MCDisassembler *Decoder) {
+  return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset,
+                                           Width, /*InstSize=*/4);
+}
+
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address,
+                               const MCDisassembler *Decoder) {
+  int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4;
+  if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI,
+                                Decoder))
+    MI.addOperand(MCOperand::createImm(CallOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address,
+                                const MCDisassembler *Decoder) {
+  assert(isUInt<5>(insn));
+  MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const MCDisassembler *Decoder) {
+  assert(isUInt<13>(insn));
+  MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn)));
+  return MCDisassembler::Success;
+}
+
 template <unsigned N>
-constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
+static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal, uint64_t Address,
+                               const MCDisassembler *Decoder) {
+  int64_t BranchOffset = SignExtend64(ImmVal, N) * 4;
+  if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI,
+                                Decoder))
+    MI.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
 #include "SparcGenDisassemblerTables.inc"
 
 /// Read four bytes from the ArrayRef and return 32 bit word.
@@ -321,45 +352,3 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
 
   return Result;
 }
-
-static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
-                                     uint64_t Address, uint64_t Offset,
-                                     uint64_t Width, MCInst &MI,
-                                     const MCDisassembler *Decoder) {
-  return Decoder->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset,
-                                           Width, /*InstSize=*/4);
-}
-
-static DecodeStatus DecodeCall(MCInst &MI, unsigned insn, uint64_t Address,
-                               const MCDisassembler *Decoder) {
-  int64_t CallOffset = SignExtend64(fieldFromInstruction(insn, 0, 30), 30) * 4;
-  if (!tryAddingSymbolicOperand(Address + CallOffset, false, Address, 0, 30, MI,
-                                Decoder))
-    MI.addOperand(MCOperand::createImm(CallOffset));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeSIMM5(MCInst &MI, unsigned insn, uint64_t Address,
-                                const MCDisassembler *Decoder) {
-  assert(isUInt<5>(insn));
-  MI.addOperand(MCOperand::createImm(SignExtend64<5>(insn)));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn, uint64_t Address,
-                                 const MCDisassembler *Decoder) {
-  assert(isUInt<13>(insn));
-  MI.addOperand(MCOperand::createImm(SignExtend64<13>(insn)));
-  return MCDisassembler::Success;
-}
-
-template <unsigned N>
-constexpr static DecodeStatus DecodeDisp(MCInst &MI, uint32_t ImmVal,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder) {
-  int64_t BranchOffset = SignExtend64(ImmVal, N) * 4;
-  if (!tryAddingSymbolicOperand(Address + BranchOffset, true, Address, 0, N, MI,
-                                Decoder))
-    MI.addOperand(MCOperand::createImm(BranchOffset));
-  return MCDisassembler::Success;
-}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index fa07578e512b..9fa60ee5229b 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -81,8 +81,16 @@ static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) {
 static MCSubtargetInfo *
 createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
   if (CPU.empty())
-    CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8";
-  return createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+    CPU = TT.getArch() == Triple::sparcv9 ? "v9" : "v8";
+
+  MCSubtargetInfo *STI =
+      createSparcMCSubtargetInfoImpl(TT, CPU, /*TuneCPU=*/CPU, FS);
+  if (TT.isSPARC64() && !STI->hasFeature(Sparc::Feature64Bit)) {
+    FeatureBitset Features = STI->getFeatureBits();
+    STI->setFeatureBits(Features.set(Sparc::Feature64Bit));
+  }
+
+  return STI;
 }
 
 static MCTargetStreamer *
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index a7b0538d683b..b523366e6ada 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -28,6 +28,7 @@ class MCRegisterInfo;
 class MCSubtargetInfo;
 class MCTargetOptions;
 class Target;
+class Triple;
 
 MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
                                         MCContext &Ctx);
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index cee671e34951..7137e5fbff4f 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -34,6 +34,9 @@ def FeatureNoFMULS
 def FeatureV9
   : SubtargetFeature<"v9", "IsV9", "true",
                      "Enable SPARC-V9 instructions">;
+def Feature64Bit : SubtargetFeature<"64bit", "Is64Bit", "true",
+                                    "Enable 64-bit mode", [FeatureV9]>;
+
 def FeatureV8Plus
   : SubtargetFeature<"v8plus", "IsV8Plus", "true",
                      "Enable V8+ mode, allowing use of 64-bit V9 instructions in 32-bit code">;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index d01218f573dc..2737cca62cd2 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1907,37 +1907,37 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
 
     // Setup Runtime library names.
     if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
-      setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Qp_add);
-      setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Qp_sub);
-      setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Qp_mul);
-      setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Qp_div);
-      setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Qp_sqrt);
-      setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Qp_qtoi);
-      setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Qp_qtoui);
-      setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Qp_itoq);
-      setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Qp_uitoq);
-      setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Qp_qtox);
-      setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Qp_qtoux);
-      setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Qp_xtoq);
-      setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Qp_uxtoq);
-      setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Qp_stoq);
-      setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Qp_dtoq);
-      setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Qp_qtos);
-      setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Qp_qtod);
+      setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Qp_add);
+      setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Qp_sub);
+      setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Qp_mul);
+      setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Qp_div);
+      setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Qp_sqrt);
+      setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Qp_qtoi);
+      setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Qp_qtoui);
+      setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Qp_itoq);
+      setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Qp_uitoq);
+      setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::impl__Qp_qtox);
+      setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::impl__Qp_qtoux);
+      setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::impl__Qp_xtoq);
+      setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::impl__Qp_uxtoq);
+      setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::impl__Qp_stoq);
+      setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Qp_dtoq);
+      setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Qp_qtos);
+      setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Qp_qtod);
     } else if (!Subtarget->useSoftFloat()) {
-      setLibcallImpl(RTLIB::ADD_F128, RTLIB::_Q_add);
-      setLibcallImpl(RTLIB::SUB_F128, RTLIB::_Q_sub);
-      setLibcallImpl(RTLIB::MUL_F128, RTLIB::_Q_mul);
-      setLibcallImpl(RTLIB::DIV_F128, RTLIB::_Q_div);
-      setLibcallImpl(RTLIB::SQRT_F128, RTLIB::_Q_sqrt);
-      setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::_Q_qtoi);
-      setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Q_qtou);
-      setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Q_itoq);
-      setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Q_utoq);
-      setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Q_stoq);
-      setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Q_dtoq);
-      setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Q_qtos);
-      setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::_Q_qtod);
+      setLibcallImpl(RTLIB::ADD_F128, RTLIB::impl__Q_add);
+      setLibcallImpl(RTLIB::SUB_F128, RTLIB::impl__Q_sub);
+      setLibcallImpl(RTLIB::MUL_F128, RTLIB::impl__Q_mul);
+      setLibcallImpl(RTLIB::DIV_F128, RTLIB::impl__Q_div);
+      setLibcallImpl(RTLIB::SQRT_F128, RTLIB::impl__Q_sqrt);
+      setLibcallImpl(RTLIB::FPTOSINT_F128_I32, RTLIB::impl__Q_qtoi);
+      setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::impl__Q_qtou);
+      setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::impl__Q_itoq);
+      setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::impl__Q_utoq);
+      setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::impl__Q_stoq);
+      setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::impl__Q_dtoq);
+      setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::impl__Q_qtos);
+      setLibcallImpl(RTLIB::FPROUND_F128_F64, RTLIB::impl__Q_qtod);
     }
   }
 
@@ -3510,7 +3510,7 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
 
 // Override to enable LOAD_STACK_GUARD lowering on Linux.
 bool SparcTargetLowering::useLoadStackGuardNode(const Module &M) const {
-  if (!Subtarget->isTargetLinux())
+  if (!Subtarget->getTargetTriple().isOSLinux())
     return TargetLowering::useLoadStackGuardNode(M);
   return true;
 }
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index a7fbbd4044c1..cd0f64991298 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -37,8 +37,8 @@ static cl::opt<unsigned>
 // Pin the vtable to this file.
 void SparcInstrInfo::anchor() {}
 
-SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
-    : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
+SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST)
+    : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
       Subtarget(ST) {}
 
 /// isLoadFromStackSlot - If the specified machine instruction is a direct
@@ -643,7 +643,7 @@ unsigned SparcInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
 bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case TargetOpcode::LOAD_STACK_GUARD: {
-    assert(Subtarget.isTargetLinux() &&
+    assert(Subtarget.getTargetTriple().isOSLinux() &&
            "Only Linux target is expected to contain LOAD_STACK_GUARD");
     // offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc.
     const int64_t Offset = Subtarget.is64Bit() ? 0x28 : 0x14;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h
index 1feb12ba2fda..01d020473494 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.h
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -40,7 +40,7 @@ class SparcInstrInfo : public SparcGenInstrInfo {
   const SparcSubtarget& Subtarget;
   virtual void anchor();
 public:
-  explicit SparcInstrInfo(SparcSubtarget &ST);
+  explicit SparcInstrInfo(const SparcSubtarget &ST);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 1a32eafb0e83..53972d6c105a 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -1785,22 +1785,22 @@ let Predicates = [HasV9], Uses = [ASR3], Constraints = "$swap = $rd" in
 // as inline assembler-supported instructions.
 let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
   def SMACrr :  F3_1<2, 0b111111,
-                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
                    "smac $rs1, $rs2, $rd",
                    [], IIC_smac_umac>;
 
   def SMACri :  F3_2<2, 0b111111,
-                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
                    "smac $rs1, $simm13, $rd",
                    [], IIC_smac_umac>;
 
   def UMACrr :  F3_1<2, 0b111110,
-                  (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
                    "umac $rs1, $rs2, $rd",
                    [], IIC_smac_umac>;
 
   def UMACri :  F3_2<2, 0b111110,
-                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
                    "umac $rs1, $simm13, $rd",
                    [], IIC_smac_umac>;
 }
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index e42df1d68613..005930834a0c 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -28,10 +28,11 @@ void SparcSubtarget::anchor() { }
 
 SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
     StringRef CPU, StringRef TuneCPU, StringRef FS) {
+  const Triple &TT = getTargetTriple();
   // Determine default and user specified characteristics
   std::string CPUName = std::string(CPU);
   if (CPUName.empty())
-    CPUName = (Is64Bit) ? "v9" : "v8";
+    CPUName = TT.isSPARC64() ? "v9" : "v8";
 
   if (TuneCPU.empty())
     TuneCPU = CPUName;
@@ -39,6 +40,12 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
   // Parse features string.
   ParseSubtargetFeatures(CPUName, TuneCPU, FS);
 
+  if (!Is64Bit && TT.isSPARC64()) {
+    FeatureBitset Features = getFeatureBits();
+    setFeatureBits(Features.set(Sparc::Feature64Bit));
+    Is64Bit = true;
+  }
+
   // Popc is a v9-only instruction.
   if (!IsV9)
     UsePopc = false;
@@ -47,11 +54,9 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(
 }
 
 SparcSubtarget::SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU,
-                               const StringRef &FS, const TargetMachine &TM,
-                               bool is64Bit)
+                               const StringRef &FS, const TargetMachine &TM)
     : SparcGenSubtargetInfo(TM.getTargetTriple(), CPU, TuneCPU, FS),
       ReserveRegister(TM.getMCRegisterInfo()->getNumRegs()),
-      TargetTriple(TM.getTargetTriple()), Is64Bit(is64Bit),
       InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
       TLInfo(TM, *this), FrameLowering(*this) {
   TSInfo = std::make_unique<SparcSelectionDAGInfo>();
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h
index 5785c199f44b..b1decca0a4f0 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -34,11 +34,8 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
   // register.
   BitVector ReserveRegister;
 
-  Triple TargetTriple;
   virtual void anchor();
 
-  bool Is64Bit;
-
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
   bool ATTRIBUTE = DEFAULT;
 #include "SparcGenSubtargetInfo.inc"
@@ -50,7 +47,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
 
 public:
   SparcSubtarget(const StringRef &CPU, const StringRef &TuneCPU,
-                 const StringRef &FS, const TargetMachine &TM, bool is64bit);
+                 const StringRef &FS, const TargetMachine &TM);
 
   ~SparcSubtarget() override;
 
@@ -80,8 +77,6 @@ public:
                                                   StringRef TuneCPU,
                                                   StringRef FS);
 
-  bool is64Bit() const { return Is64Bit; }
-
   /// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
   /// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
   int64_t getStackPointerBias() const {
@@ -96,8 +91,6 @@ public:
   /// returns adjusted framesize which includes space for register window
   /// spills and arguments.
   int getAdjustedFrameSize(int stackSize) const;
-
-  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index 52076a6b4dd2..754c8f63ca4e 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -38,7 +38,9 @@ static cl::opt<bool>
     BranchRelaxation("sparc-enable-branch-relax", cl::Hidden, cl::init(true),
                      cl::desc("Relax out of range conditional branches"));
 
-static std::string computeDataLayout(const Triple &T, bool is64Bit) {
+static std::string computeDataLayout(const Triple &T) {
+  const bool is64Bit = T.isSPARC64();
+
   // Sparc is typically big endian, but some are little.
   std::string Ret = T.getArch() == Triple::sparcel ? "e" : "E";
   Ret += "-m:e";
@@ -107,15 +109,14 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT,
                                        const TargetOptions &Options,
                                        std::optional<Reloc::Model> RM,
                                        std::optional<CodeModel::Model> CM,
-                                       CodeGenOptLevel OL, bool JIT,
-                                       bool is64bit)
+                                       CodeGenOptLevel OL, bool JIT)
     : CodeGenTargetMachineImpl(
-          T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
+          T, computeDataLayout(TT), TT, CPU, FS, Options,
           getEffectiveRelocModel(RM),
-          getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM), is64bit,
-                                     JIT),
+          getEffectiveSparcCodeModel(CM, getEffectiveRelocModel(RM),
+                                     TT.isSPARC64(), JIT),
           OL),
-      TLOF(std::make_unique<SparcELFTargetObjectFile>()), is64Bit(is64bit) {
+      TLOF(std::make_unique<SparcELFTargetObjectFile>()) {
   initAsmInfo();
 }
 
@@ -148,8 +149,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const {
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
-    I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this,
-                                         this->is64Bit);
+    I = std::make_unique<SparcSubtarget>(CPU, TuneCPU, FS, *this);
   }
   return I.get();
 }
@@ -212,7 +212,7 @@ SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT,
                                            std::optional<Reloc::Model> RM,
                                            std::optional<CodeModel::Model> CM,
                                            CodeGenOptLevel OL, bool JIT)
-    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {}
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
 
 void SparcV9TargetMachine::anchor() { }
 
@@ -222,7 +222,7 @@ SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT,
                                            std::optional<Reloc::Model> RM,
                                            std::optional<CodeModel::Model> CM,
                                            CodeGenOptLevel OL, bool JIT)
-    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {}
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
 
 void SparcelTargetMachine::anchor() {}
 
@@ -232,4 +232,4 @@ SparcelTargetMachine::SparcelTargetMachine(const Target &T, const Triple &TT,
                                            std::optional<Reloc::Model> RM,
                                            std::optional<CodeModel::Model> CM,
                                            CodeGenOptLevel OL, bool JIT)
-    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {}
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT) {}
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.h b/llvm/lib/Target/Sparc/SparcTargetMachine.h
index 9a226a47671b..e7d038c5779d 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.h
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -23,7 +23,6 @@ namespace llvm {
 
 class SparcTargetMachine : public CodeGenTargetMachineImpl {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  bool is64Bit;
   mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap;
 
 public:
@@ -31,7 +30,7 @@ public:
                      StringRef FS, const TargetOptions &Options,
                      std::optional<Reloc::Model> RM,
                      std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
-                     bool JIT, bool is64bit);
+                     bool JIT);
   ~SparcTargetMachine() override;
 
   const SparcSubtarget *getSubtargetImpl(const Function &F) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td
index 2c48da8320fb..4ccc3d3079fc 100644
--- a/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -196,7 +196,7 @@ def FeatureVector : SystemZFeature<
 >;
 def FeatureNoVector : SystemZMissingFeature<"Vector">;
 
-def NoVecHwMode : HwMode<"-vector", [FeatureNoVector]>;
+def NoVecHwMode : HwMode<[FeatureNoVector]>;
 
 def Arch11NewFeatures : SystemZFeatureList<[
     FeatureLoadAndZeroRightmostByte,
@@ -426,4 +426,3 @@ def Arch9UnsupportedFeatures
   : SystemZFeatureAdd<Arch10UnsupportedFeatures.List, Arch10NewFeatures.List>;
 def Arch8UnsupportedFeatures
   : SystemZFeatureAdd<Arch9UnsupportedFeatures.List,  Arch9NewFeatures.List>;
-
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb4..3b7d11a318dc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -287,6 +287,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     // Additional instructions available with z17.
     if (Subtarget.hasVectorEnhancements3()) {
       setOperationAction(ISD::ABS, MVT::i128, Legal);
+
+      setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+                         MVT::i128, Legal);
     }
   }
 
@@ -492,6 +495,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
       // and inverting the result as necessary.
       setOperationAction(ISD::SETCC, VT, Custom);
+
+      setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT,
+                         Legal);
     }
   }
 
@@ -6719,6 +6725,14 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
     if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
       SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
       SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+      if (ShiftAmt > 120) {
+        // For N in 121..127, fshl N == fshr (128 - N); the resulting fshr
+        // amount is in 1..7, where SHR_DOUBLE_BIT emits fewer instructions.
+        SDValue Val =
+            DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
+                        DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
+        return DAG.getBitcast(MVT::i128, Val);
+      }
       SmallVector<int, 16> Mask(16);
       for (unsigned Elt = 0; Elt < 16; Elt++)
         Mask[Elt] = (ShiftAmt >> 3) + Elt;
@@ -6742,13 +6756,21 @@ SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
   // i128 FSHR with a constant amount that is a multiple of 8 can be
   // implemented via VECTOR_SHUFFLE.  If we have the vector-enhancements-2
   // facility, FSHR with a constant amount less than 8 can be implemented
-  // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
+  // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
   // combination of the two.
   if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
     uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
     if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
       SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
       SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+      if (ShiftAmt > 120) {
+        // For N in 121..127, fshr N == fshl (128 - N); the resulting fshl
+        // amount is in 1..7, where SHL_DOUBLE_BIT emits fewer instructions.
+        SDValue Val =
+            DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
+                        DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
+        return DAG.getBitcast(MVT::i128, Val);
+      }
       SmallVector<int, 16> Mask(16);
       for (unsigned Elt = 0; Elt < 16; Elt++)
         Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index d0a549518cc4..82415f412509 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2646,28 +2646,24 @@ class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
   : InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2),
             mnemonic#"\t$R1, $RI2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls>
   : InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2),
              mnemonic#"\t$R1, $RI2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls>
   : InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
            mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
   : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
             mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
@@ -2675,7 +2671,6 @@ class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
             (ins cls:$R1src, (bdxaddr12only $B2, $D2, $X2):$XBD2),
             mnemonic#"\t$R1, $XBD2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2683,14 +2678,12 @@ class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
              (ins cls:$R1src, (bdxaddr20only $B2, $D2, $X2):$XBD2),
              mnemonic#"\t$R1, $XBD2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
   : InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
             mnemonic#"\t$R1, $R3, $RI2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2698,7 +2691,6 @@ class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
              (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
              mnemonic#"\t$R1, $R3, $RI2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
@@ -2706,7 +2698,6 @@ class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
             (ins cls:$R1src, cls:$R3, (bdaddr12only $B2, $D2):$BD2),
             mnemonic#"\t$R1, $R3, $BD2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
@@ -2715,7 +2706,6 @@ class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
              (ins cls:$R1src, cls:$R3, (bdaddr20only $B2, $D2):$BD2),
              mnemonic#"\t$R1, $R3, $BD2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
@@ -3116,7 +3106,6 @@ class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
   : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src),
             mnemonic#"\t$R1", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let R2 = 0;
 }
 
@@ -3125,7 +3114,6 @@ class UnaryMemRRFc<string mnemonic, bits<16> opcode,
   : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
             mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let M3 = 0;
 }
 
@@ -3163,7 +3151,6 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode,
                    (z_select_ccmask (operator bdaddr20only:$BD2), cls:$R1src,
                                     cond4:$valid, cond4:$M3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
   let CCMaskLast = 1;
@@ -3184,7 +3171,6 @@ class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
   let mayLoad = 1;
   let AccessBytes = bytes;
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 // Like CondUnaryRSY, but with a fixed CC mask.
@@ -3194,7 +3180,6 @@ class FixedCondUnaryRSY<CondVariant V, string mnemonic, bits<16> opcode,
   : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, (mode $B2, $D2):$BD2),
              mnemonic#V.suffix#"\t$R1, $BD2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
   let isAsmParserOnly = V.alternate;
@@ -3439,7 +3424,6 @@ class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
   : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
            mnemonic#"\t$R1, $R2", []> {
     let Constraints = "$R1 = $R1src, $R2 = $R2src";
-    let DisableEncoding = "$R1src, $R2src";
 }
 
 class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
@@ -3447,7 +3431,6 @@ class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
   : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
             mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R2 = $R2src";
-  let DisableEncoding = "$R2src";
 }
 
 class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
@@ -3455,7 +3438,6 @@ class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
   : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
             mnemonic#"\t$R1, $R2", []> {
     let Constraints = "$R1 = $R1src, $R2 = $R2src";
-    let DisableEncoding = "$R1src, $R2src";
 }
 
 class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
@@ -3463,7 +3445,6 @@ class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
   : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
              mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src, $R2 = $R2src";
-  let DisableEncoding = "$R1src, $R2src";
   let M3 = 0;
 }
 
@@ -3475,7 +3456,6 @@ class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#cls1;
   let OpType = "reg";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3486,7 +3466,6 @@ class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#cls1;
   let OpType = "reg";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BinaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3565,7 +3544,6 @@ class BinaryMemRRFc<string mnemonic, bits<16> opcode,
   : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
             mnemonic#"\t$R1, $R2, $M3", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
@@ -3594,7 +3572,6 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
              [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
                                               cond4:$valid, cond4:$M3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let CCMaskLast = 1;
   let NumOpsKey = !subst("loc", "sel", mnemonic);
   let NumOpsValue = "2";
@@ -3610,7 +3587,6 @@ class AsmCondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
              (ins cls1:$R1src, cls2:$R2, imm32zx4:$M3),
              mnemonic#"\t$R1, $R2, $M3", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 // Like CondBinaryRRF, but with a fixed CC mask.
@@ -3619,7 +3595,6 @@ class FixedCondBinaryRRF<CondVariant V, string mnemonic, bits<16> opcode,
   : InstRRFc<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
              mnemonic#V.suffix#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let isAsmParserOnly = V.alternate;
   let AsmVariantName = V.asmvariant;
   let M3 = V.ccmask;
@@ -3678,7 +3653,6 @@ class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
             mnemonic#"\t$R1, $I2",
             [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3707,7 +3681,6 @@ class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
              [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
                                              cond4:$valid, cond4:$M3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let CCMaskLast = 1;
 }
 
@@ -3719,7 +3692,6 @@ class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
              (ins cls:$R1src, imm:$I2, imm32zx4:$M3),
              mnemonic#"\t$R1, $I2, $M3", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 // Like CondBinaryRIE, but with a fixed CC mask.
@@ -3728,7 +3700,6 @@ class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
   : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
              mnemonic#V.suffix#"\t$R1, $I2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let isAsmParserOnly = V.alternate;
   let AsmVariantName = V.asmvariant;
   let M3 = V.ccmask;
@@ -3747,7 +3718,6 @@ class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
              mnemonic#"\t$R1, $I2",
              [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
@@ -3758,7 +3728,6 @@ class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
             [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
   let R3 = 0;
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -3794,7 +3763,6 @@ class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#"r"#cls;
   let OpType = "mem";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -3809,7 +3777,6 @@ class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#"r"#cls;
   let OpType = "mem";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
   let M3 = 0;
@@ -3838,7 +3805,6 @@ class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#"r"#cls;
   let OpType = "mem";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4500,7 +4466,6 @@ class SideEffectTernaryMemMemRRFa<string mnemonic, bits<16> opcode,
              (ins cls1:$R1src, cls2:$R2src, cls3:$R3),
              mnemonic#"\t$R1, $R2, $R3", []> {
   let Constraints = "$R1 = $R1src, $R2 = $R2src";
-  let DisableEncoding = "$R1src, $R2src";
   let M4 = 0;
 }
 
@@ -4520,7 +4485,6 @@ class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
              (ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
              mnemonic#"\t$R1, $R3, $R2", []> {
   let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
-  let DisableEncoding = "$R1src, $R2src, $R3src";
   let M4 = 0;
 }
 
@@ -4544,7 +4508,6 @@ class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
              (ins cls1:$R1src, cls2:$R2src, imm:$M3),
              mnemonic#"\t$R1, $R2, $M3", []> {
   let Constraints = "$R1 = $R1src, $R2 = $R2src";
-  let DisableEncoding = "$R1src, $R2src";
 }
 
 multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
@@ -4574,7 +4537,6 @@ class TernaryRRFb<string mnemonic, bits<16> opcode,
              (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
              mnemonic#"\t$R1, $R3, $R2, $M4", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
@@ -4591,7 +4553,6 @@ class TernaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#cls;
   let OpType = "reg";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
@@ -4601,7 +4562,6 @@ class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
             mnemonic#"\t$R1, $M3, $BD2", []> {
 
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4613,7 +4573,6 @@ class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
              mnemonic#"\t$R1, $M3, $BD2", []> {
 
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4646,7 +4605,6 @@ class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
             (ins cls1:$R1src, cls2:$R3src, (shift12only $B2, $D2):$BD2),
             mnemonic#"\t$R1, $R3, $BD2", []> {
     let Constraints = "$R1 = $R1src, $R3 = $R3src";
-    let DisableEncoding = "$R1src, $R3src";
 }
 
 class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
@@ -4655,7 +4613,6 @@ class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
              (ins cls1:$R1src, cls2:$R3src, (shift20only $B2, $D2):$BD2),
              mnemonic#"\t$R1, $R3, $BD2", []> {
     let Constraints = "$R1 = $R1src, $R3 = $R3src";
-    let DisableEncoding = "$R1src, $R3src";
 }
 
 class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -4669,7 +4626,6 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let OpKey = mnemonic#"r"#cls;
   let OpType = "mem";
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4681,7 +4637,6 @@ class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
              [(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
                                                   imm:$I2, index:$M3))]> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
 }
 
 class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
@@ -4893,7 +4848,6 @@ class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                                                   cls:$R3,
                                                   shift12only:$BD2))]> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
   let M4 = type;
 }
 
@@ -4913,7 +4867,6 @@ class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
                   imm32zx4:$M4),
              mnemonic#"\t$V1, $R3, $BD2, $M4", []> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
 }
 
 class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
@@ -4922,7 +4875,6 @@ class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
            (ins VR128:$V1src, (bdvaddr12only $B2, $D2, $V2):$VBD2, index:$M3),
            mnemonic#"\t$V1, $VBD2, $M3", []> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4936,7 +4888,6 @@ class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                                                 bdxaddr12only:$XBD2,
                                                 index:$M3))]> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
 }
@@ -4951,7 +4902,6 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato
                                                   (tr2.vt tr2.op:$V3),
                                                   imm32zx8_timm:$I4))]> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
   let M5 = type;
 }
 
@@ -4961,7 +4911,6 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode>
                   imm32zx8:$I4, imm32zx4:$M5),
              mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []> {
   let Constraints = "$V1 = $V1src";
-  let DisableEncoding = "$V1src";
 }
 
 class QuaternaryVRIf<string mnemonic, bits<16> opcode>
@@ -5087,7 +5036,6 @@ class CmpSwapRRE<string mnemonic, bits<16> opcode,
   : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
             mnemonic#"\t$R1, $R2", []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -5099,7 +5047,6 @@ class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
             mnemonic#"\t$R1, $R3, $BD2",
             [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -5111,7 +5058,6 @@ class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
              mnemonic#"\t$R1, $R3, $BD2",
              [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -5128,7 +5074,7 @@ multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
 
 multiclass RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
                        RegisterOperand cls2, bits<8> I3Or = 0, bits<8> I4Or = 0> {
-  let Constraints = "$R1 = $R1src", DisableEncoding = "$R1src" in {
+  let Constraints = "$R1 = $R1src" in {
   def "" : InstRIEf<opcode, (outs cls1:$R1),
              (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
                   imm32zx8:$I5),
@@ -5328,7 +5274,6 @@ class CondBinaryRRFPseudo<string mnemonic, RegisterOperand cls1,
            [(set cls1:$R1, (z_select_ccmask cls2:$R2, cls1:$R1src,
                                             cond4:$valid, cond4:$M3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let CCMaskLast = 1;
   let NumOpsKey = !subst("loc", "sel", mnemonic);
   let NumOpsValue = "2";
@@ -5359,7 +5304,6 @@ class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm>
            [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
                                            cond4:$valid, cond4:$M3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let CCMaskLast = 1;
 }
 
@@ -5374,7 +5318,6 @@ class CondUnaryRSYPseudo<string mnemonic, SDPatternOperator operator,
                  (z_select_ccmask (operator mode:$BD2), cls:$R1src,
                                   cond4:$valid, cond4:$R3))]> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let AccessBytes = bytes;
   let CCMaskLast = 1;
@@ -5414,7 +5357,6 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
                 imm32zx8:$I5),
            []> {
   let Constraints = "$R1 = $R1src";
-  let DisableEncoding = "$R1src";
 }
 
 // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 783f86aecce4..2e21f27c9032 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -59,8 +59,8 @@ static uint64_t allOnes(unsigned int Count) {
 // Pin the vtable to this file.
 void SystemZInstrInfo::anchor() {}
 
-SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
-    : SystemZGenInstrInfo(-1, -1),
+SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti)
+    : SystemZGenInstrInfo(sti, -1, -1),
       RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(),
          sti.getHwMode()),
       STI(sti) {}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8b82af61e669..7b9ad7b87a14 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -184,7 +184,7 @@ MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
 
 class SystemZInstrInfo : public SystemZGenInstrInfo {
   const SystemZRegisterInfo RI;
-  SystemZSubtarget &STI;
+  const SystemZSubtarget &STI;
 
   void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
   void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
@@ -225,7 +225,7 @@ protected:
                                        unsigned CommuteOpIdx2) const override;
 
 public:
-  explicit SystemZInstrInfo(SystemZSubtarget &STI);
+  explicit SystemZInstrInfo(const SystemZSubtarget &STI);
 
   // Override TargetInstrInfo.
   Register isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 10de8b05cf45..479bab5ce62b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -680,41 +680,41 @@ let Predicates = [FeatureVector] in {
   let isCommutable = 1 in {
     // Maximum.
     def VMX  : BinaryVRRcGeneric<"vmx", 0xE7FF>;
-    def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
-    def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
-    def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
-    def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
+    def VMXB : BinaryVRRc<"vmxb", 0xE7FF, smax, v128b, v128b, 0>;
+    def VMXH : BinaryVRRc<"vmxh", 0xE7FF, smax, v128h, v128h, 1>;
+    def VMXF : BinaryVRRc<"vmxf", 0xE7FF, smax, v128f, v128f, 2>;
+    def VMXG : BinaryVRRc<"vmxg", 0xE7FF, smax, v128g, v128g, 3>;
     let Predicates = [FeatureVectorEnhancements3] in
-      def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, null_frag, v128q, v128q, 4>;
+      def VMXQ : BinaryVRRc<"vmxq", 0xE7FF, smax, v128q, v128q, 4>;
 
     // Maximum logical.
     def VMXL  : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
-    def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
-    def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
-    def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
-    def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+    def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, umax, v128b, v128b, 0>;
+    def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, umax, v128h, v128h, 1>;
+    def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, umax, v128f, v128f, 2>;
+    def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, umax, v128g, v128g, 3>;
     let Predicates = [FeatureVectorEnhancements3] in
-      def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, null_frag, v128q, v128q, 4>;
+      def VMXLQ : BinaryVRRc<"vmxlq", 0xE7FD, umax, v128q, v128q, 4>;
   }
 
   let isCommutable = 1 in {
     // Minimum.
     def VMN  : BinaryVRRcGeneric<"vmn", 0xE7FE>;
-    def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
-    def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
-    def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
-    def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
+    def VMNB : BinaryVRRc<"vmnb", 0xE7FE, smin, v128b, v128b, 0>;
+    def VMNH : BinaryVRRc<"vmnh", 0xE7FE, smin, v128h, v128h, 1>;
+    def VMNF : BinaryVRRc<"vmnf", 0xE7FE, smin, v128f, v128f, 2>;
+    def VMNG : BinaryVRRc<"vmng", 0xE7FE, smin, v128g, v128g, 3>;
     let Predicates = [FeatureVectorEnhancements3] in
-      def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, null_frag, v128q, v128q, 4>;
+      def VMNQ : BinaryVRRc<"vmnq", 0xE7FE, smin, v128q, v128q, 4>;
 
     // Minimum logical.
     def VMNL  : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
-    def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
-    def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
-    def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
-    def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
+    def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, umin, v128b, v128b, 0>;
+    def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, umin, v128h, v128h, 1>;
+    def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, umin, v128f, v128f, 2>;
+    def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, umin, v128g, v128g, 3>;
     let Predicates = [FeatureVectorEnhancements3] in
-      def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, null_frag, v128q, v128q, 4>;
+      def VMNLQ : BinaryVRRc<"vmnlq", 0xE7FC, umin, v128q, v128q, 4>;
   }
 
   let isCommutable = 1 in {
@@ -1250,54 +1250,45 @@ defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>;
 defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>;
 defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>;
 
-// Instantiate minimum- and maximum-related patterns for TYPE.  CMPH is the
-// signed or unsigned "set if greater than" comparison instruction and
-// MIN and MAX are the associated minimum and maximum instructions.
-multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph,
-                                  Instruction min, Instruction max> {
-  let Predicates = [FeatureVector] in {
-    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)),
-              (max VR128:$x, VR128:$y)>;
-    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)),
-              (min VR128:$x, VR128:$y)>;
-    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
-                             VR128:$x, VR128:$y)),
-              (min VR128:$x, VR128:$y)>;
-    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
-                             VR128:$y, VR128:$x)),
-              (max VR128:$x, VR128:$y)>;
-  }
+// Instantiate packs/packu: recognize a saturating truncation and convert
+// into the corresponding packs/packu instruction.
+multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
+                                    Instruction packs> {
+  def : Pat<
+    (output (z_pack
+      (smin (smax (input VR128:$a), ssat_trunc_min_vec), ssat_trunc_max_vec),
+      (smin (smax (input VR128:$b), ssat_trunc_min_vec), ssat_trunc_max_vec)
+    )),
+    (packs VR128:$a, VR128:$b)
+  >;
+
+  def : Pat<
+    (output (z_pack
+      (smax (smin (input VR128:$a), ssat_trunc_max_vec), ssat_trunc_min_vec),
+      (smax (smin (input VR128:$b), ssat_trunc_max_vec), ssat_trunc_min_vec)
+    )),
+    (packs VR128:$a, VR128:$b)
+  >;
 }
 
-// Signed min/max.
-defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>;
-defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>;
-defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>;
-defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>;
-
-let Predicates = [FeatureVectorEnhancements3] in {
-  def : Pat<(i128 (or (and VR128:$x, (z_vicmph VR128:$x, VR128:$y)),
-                      (and VR128:$y, (not (z_vicmph VR128:$x, VR128:$y))))),
-            (VMXQ VR128:$x, VR128:$y)>;
-  def : Pat<(i128 (or (and VR128:$y, (z_vicmph VR128:$x, VR128:$y)),
-                      (and VR128:$x, (not (z_vicmph VR128:$x, VR128:$y))))),
-            (VMNQ VR128:$x, VR128:$y)>;
+defm : SignedSaturatingTruncate<v8i16, v16i8, VPKSH>;
+defm : SignedSaturatingTruncate<v4i32, v8i16, VPKSF>;
+defm : SignedSaturatingTruncate<v2i64, v4i32, VPKSG>;
+
+multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
+                                      Instruction packu> {
+  def : Pat<
+    (output (z_pack
+      (umin (input VR128:$a), usat_trunc_max_vec),
+      (umin (input VR128:$b), usat_trunc_max_vec)
+    )),
+    (packu VR128:$a, VR128:$b)
+  >;
 }
 
-// Unsigned min/max.
-defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>;
-defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>;
-defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>;
-defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>;
-
-let Predicates = [FeatureVectorEnhancements3] in {
-  def : Pat<(i128 (or (and VR128:$x, (z_vicmphl VR128:$x, VR128:$y)),
-                      (and VR128:$y, (not (z_vicmphl VR128:$x, VR128:$y))))),
-            (VMXLQ VR128:$x, VR128:$y)>;
-  def : Pat<(i128 (or (and VR128:$y, (z_vicmphl VR128:$x, VR128:$y)),
-                      (and VR128:$x, (not (z_vicmphl VR128:$x, VR128:$y))))),
-            (VMNLQ VR128:$x, VR128:$y)>;
-}
+defm : UnsignedSaturatingTruncate<v8i16, v16i8, VPKLSH>;
+defm : UnsignedSaturatingTruncate<v4i32, v8i16, VPKLSF>;
+defm : UnsignedSaturatingTruncate<v2i64, v4i32, VPKLSG>;
 
 // Instantiate comparison patterns to recognize VACC/VSCBI for TYPE.
 multiclass IntegerComputeCarryOrBorrow<ValueType type,
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 39e216b993b1..547d3dcf9280 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -1067,6 +1067,31 @@ def vsplat_imm_eq_1 : PatFrag<(ops), (build_vector), [{
 }]>;
 def z_vzext1 : PatFrag<(ops node:$x), (and node:$x, vsplat_imm_eq_1)>;
 
+// Vector constants for saturating truncation, containing the minimum and
+// maximum value for the integer type that is half of the element width.
+def ssat_trunc_min_vec: PatFrag<(ops), (build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+  unsigned SizeInBits = EltTy.getSizeInBits();
+  APInt min = APInt::getSignedMinValue(SizeInBits / 2).sext(SizeInBits);
+  return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, min);
+}]>;
+def ssat_trunc_max_vec: PatFrag<(ops), (build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+  unsigned SizeInBits = EltTy.getSizeInBits();
+  APInt max = APInt::getSignedMaxValue(SizeInBits / 2).sext(SizeInBits);
+  return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max);
+}]>;
+
+def usat_trunc_max_vec: PatFrag<(ops), (build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+  unsigned SizeInBits = EltTy.getSizeInBits();
+  APInt max = APInt::getMaxValue(SizeInBits / 2).zext(SizeInBits);
+  return ISD::isConstantSplatVector(N, Imm) && APInt::isSameValue(Imm, max);
+}]>;
+
 // Signed "integer greater than zero" on vectors.
 def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, immAllZerosV)>;
 
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 28495e7c5719..343bcce80e3a 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -191,8 +191,9 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
   }
 }
 
-void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
-                                                           Module &M) const {
+void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(
+    MCStreamer &Streamer, Module &M,
+    std::function<void(MCStreamer &Streamer)> COMDATSymEmitter) const {
   NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName);
   if (!FuncInfo)
     return;
@@ -213,6 +214,11 @@ void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
         TM->getFunctionSections() ? Name->getString() : StringRef());
 
     Streamer.switchSection(S);
+
+    // Emit the COFF COMDAT symbol if the caller supplied an emitter.
+    if (COMDATSymEmitter)
+      COMDATSymEmitter(Streamer);
+
     Streamer.emitInt64(GUID->getZExtValue());
     Streamer.emitInt64(Hash->getZExtValue());
     Streamer.emitULEB128IntValue(Name->getString().size());
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 69b6e26e602f..ad7e503cb155 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -162,7 +162,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
   RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
   RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
   RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
-  RESET_OPTION(ApproxFuncFPMath, "approx-func-fp-math");
 }
 
 /// Returns the code generation relocation model. The choices are static, PIC,
diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp
index da6d35c8c8b4..aba6ea436e76 100644
--- a/llvm/lib/Target/TargetMachineC.cpp
+++ b/llvm/lib/Target/TargetMachineC.cpp
@@ -83,7 +83,8 @@ LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
                                  char **ErrorMessage) {
   std::string Error;
 
-  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+  Triple TT(TripleStr);
+  *T = wrap(TargetRegistry::lookupTarget(TT, Error));
 
   if (!*T) {
     if (ErrorMessage)
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index d7e1666a7417..aad826b5f285 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -57,6 +57,7 @@ LLVMInitializeVEDisassembler() {
                                          createVEDisassembler);
 }
 
+// clang-format off
 static const unsigned I32RegDecoderTable[] = {
     VE::SW0,  VE::SW1,  VE::SW2,  VE::SW3,  VE::SW4,  VE::SW5,  VE::SW6,
     VE::SW7,  VE::SW8,  VE::SW9,  VE::SW10, VE::SW11, VE::SW12, VE::SW13,
@@ -127,6 +128,7 @@ static const unsigned MiscRegDecoderTable[] = {
     VE::PMC4,       VE::PMC5,       VE::PMC6,       VE::PMC7,
     VE::PMC8,       VE::PMC9,       VE::PMC10,      VE::PMC11,
     VE::PMC12,      VE::PMC13,      VE::PMC14};
+// clang-format on
 
 static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
@@ -214,106 +216,6 @@ static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeASX(MCInst &Inst, uint64_t insn, uint64_t Address,
-                              const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                  const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn,
-                                     uint64_t Address,
-                                     const MCDisassembler *Decoder);
-static DecodeStatus DecodeTS1AMI64(MCInst &Inst, uint64_t insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeTS1AMI32(MCInst &Inst, uint64_t insn,
-                                   uint64_t Address,
-                                   const MCDisassembler *Decoder);
-static DecodeStatus DecodeCASI64(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeCASI32(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM7(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                const MCDisassembler *Decoder);
-static DecodeStatus DecodeSIMM32(MCInst &Inst, uint64_t insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-static DecodeStatus DecodeCCOperand(MCInst &Inst, uint64_t insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeRDOperand(MCInst &Inst, uint64_t insn,
-                                    uint64_t Address,
-                                    const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchCondition(MCInst &Inst, uint64_t insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-static DecodeStatus DecodeBranchConditionAlways(MCInst &Inst, uint64_t insn,
-                                                uint64_t Address,
-                                                const MCDisassembler *Decoder);
-
-#include "VEGenDisassemblerTables.inc"
-
-/// Read four bytes from the ArrayRef and return 32 bit word.
-static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
-                                      uint64_t &Size, uint64_t &Insn,
-                                      bool IsLittleEndian) {
-  // We want to read exactly 8 Bytes of data.
-  if (Bytes.size() < 8) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
-
-  Insn = IsLittleEndian
-             ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) |
-                   ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) |
-                   ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) |
-                   ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56)
-             : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) |
-                   ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) |
-                   ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) |
-                   ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56);
-
-  return MCDisassembler::Success;
-}
-
-DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
-                                            ArrayRef<uint8_t> Bytes,
-                                            uint64_t Address,
-                                            raw_ostream &CStream) const {
-  uint64_t Insn;
-  bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
-  DecodeStatus Result =
-      readInstruction64(Bytes, Address, Size, Insn, isLittleEndian);
-  if (Result == MCDisassembler::Fail)
-    return MCDisassembler::Fail;
-
-  // Calling the auto-generated decoder function.
-
-  Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI);
-
-  if (Result != MCDisassembler::Fail) {
-    Size = 8;
-    return Result;
-  }
-
-  return MCDisassembler::Fail;
-}
-
 typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned RegNo, uint64_t Address,
                                    const MCDisassembler *Decoder);
 
@@ -629,3 +531,51 @@ static DecodeStatus DecodeBranchConditionAlways(MCInst &MI, uint64_t insn,
   // Decode MEMri.
   return DecodeAS(MI, insn, Address, Decoder);
 }
+
+#include "VEGenDisassemblerTables.inc"
+
+/// Read eight bytes from the ArrayRef and store them in Insn as a 64-bit word.
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint64_t &Insn,
+                                      bool IsLittleEndian) {
+  // We want to read exactly 8 Bytes of data.
+  if (Bytes.size() < 8) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Insn = IsLittleEndian
+             ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) |
+                   ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) |
+                   ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) |
+                   ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56)
+             : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) |
+                   ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) |
+                   ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) |
+                   ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56);
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                            ArrayRef<uint8_t> Bytes,
+                                            uint64_t Address,
+                                            raw_ostream &CStream) const {
+  uint64_t Insn;
+  bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+  DecodeStatus Result =
+      readInstruction64(Bytes, Address, Size, Insn, isLittleEndian);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function.
+
+  Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI);
+
+  if (Result != MCDisassembler::Fail) {
+    Size = 8;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 98e4b452a8a5..d5e804afd27f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -34,8 +34,8 @@ using namespace llvm;
 // Pin the vtable to this file.
 void VEInstrInfo::anchor() {}
 
-VEInstrInfo::VEInstrInfo(VESubtarget &ST)
-    : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
+VEInstrInfo::VEInstrInfo(const VESubtarget &ST)
+    : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
 
 static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); }
 
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
index 49dcba503462..408d3ab9e05f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -53,7 +53,7 @@ class VEInstrInfo : public VEGenInstrInfo {
   virtual void anchor();
 
 public:
-  explicit VEInstrInfo(VESubtarget &ST);
+  explicit VEInstrInfo(const VESubtarget &ST);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 7e3f29b3bd82..9869f95ae566 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -39,6 +39,8 @@ include "VEInstrFormats.td"
 //     e.g. 0.0 (0x00000000) or -2.0 (0xC0000000=(2)1).
 //===----------------------------------------------------------------------===//
 
+defvar ve_ptr_rc = I64;
+
 def ULO7 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0x7f,
                                    SDLoc(N), MVT::i32);
@@ -325,17 +327,17 @@ def VEMEMziiAsmOperand : AsmOperandClass {
 // ASX format uses single assembly instruction format.
 def MEMrri : Operand<iPTR> {
   let PrintMethod = "printMemASXOperand";
-  let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm);
+  let MIOperandInfo = (ops ve_ptr_rc, ve_ptr_rc, i64imm);
   let ParserMatchClass = VEMEMrriAsmOperand;
 }
 def MEMrii : Operand<iPTR> {
   let PrintMethod = "printMemASXOperand";
-  let MIOperandInfo = (ops ptr_rc, i32imm, i64imm);
+  let MIOperandInfo = (ops ve_ptr_rc, i32imm, i64imm);
   let ParserMatchClass = VEMEMriiAsmOperand;
 }
 def MEMzri : Operand<iPTR> {
   let PrintMethod = "printMemASXOperand";
-  let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm);
+  let MIOperandInfo = (ops i32imm /* = 0 */, ve_ptr_rc, i64imm);
   let ParserMatchClass = VEMEMzriAsmOperand;
 }
 def MEMzii : Operand<iPTR> {
@@ -358,7 +360,7 @@ def VEMEMziAsmOperand : AsmOperandClass {
 //   1. AS generic assembly instruction format:
 def MEMriASX : Operand<iPTR> {
   let PrintMethod = "printMemASOperandASX";
-  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let MIOperandInfo = (ops ve_ptr_rc, i32imm);
   let ParserMatchClass = VEMEMriAsmOperand;
 }
 def MEMziASX : Operand<iPTR> {
@@ -370,7 +372,7 @@ def MEMziASX : Operand<iPTR> {
 //   2. AS RRM style assembly instruction format:
 def MEMriRRM : Operand<iPTR> {
   let PrintMethod = "printMemASOperandRRM";
-  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let MIOperandInfo = (ops ve_ptr_rc, i32imm);
   let ParserMatchClass = VEMEMriAsmOperand;
 }
 def MEMziRRM : Operand<iPTR> {
@@ -382,7 +384,7 @@ def MEMziRRM : Operand<iPTR> {
 //   3. AS HM style assembly instruction format:
 def MEMriHM : Operand<iPTR> {
   let PrintMethod = "printMemASOperandHM";
-  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let MIOperandInfo = (ops ve_ptr_rc, i32imm);
   let ParserMatchClass = VEMEMriAsmOperand;
 }
 def MEMziHM : Operand<iPTR> {
@@ -642,7 +644,7 @@ multiclass RRIm<string opcStr, bits<8>opc,
 
 // Special RR multiclass for 128 bits shift left instruction.
 //   e.g. SLD
-let Constraints = "$hi = $sx", DisableEncoding = "$hi", hasSideEffects = 0 in
+let Constraints = "$hi = $sx", hasSideEffects = 0 in
 multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> {
   def rrr : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, I32:$sy),
               !strconcat(opcStr, " $sx, $sz, $sy")>;
@@ -659,7 +661,7 @@ multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> {
 
 // Special RR multiclass for 128 bits shift right instruction.
 //   e.g. SRD
-let Constraints = "$low = $sx", DisableEncoding = "$low", hasSideEffects = 0 in
+let Constraints = "$low = $sx", hasSideEffects = 0 in
 multiclass RRIRDm<string opcStr, bits<8>opc, RegisterClass RC> {
   def rrr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, I32:$sy),
               !strconcat(opcStr, " $sx, $sz, $sy")>;
@@ -689,7 +691,7 @@ multiclass RRI1m<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
 
 // Special RR multiclass for MRG instruction.
 //   e.g. MRG
-let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0 in
+let Constraints = "$sx = $sd", hasSideEffects = 0 in
 multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC> {
   def rr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sd),
               !strconcat(opcStr, " $sx, $sy, $sz")>;
@@ -722,7 +724,7 @@ multiclass RRSWPm<string opcStr, bits<8>opc,
 
 // Multiclass for CMOV instructions.
 //   e.g. CMOVL, CMOVW, CMOVD, and etc.
-let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0,
+let Constraints = "$sx = $sd", hasSideEffects = 0,
     cfw = ? in
 multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
                    SDPatternOperator OpNode = null_frag,
@@ -805,7 +807,7 @@ multiclass PFCHm<string opcStr, bits<8>opc> {
 
 // Multiclass for CAS instructions.
 //   e.g. TS1AML, TS1AMW, TS2AM, and etc.
-let Constraints = "$sx = $sd", DisableEncoding = "$sd",
+let Constraints = "$sx = $sd",
     mayStore=1, mayLoad = 1, hasSideEffects = 0 in
 multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
                     Operand immOp, Operand MEM, ComplexPattern ADDR,
@@ -920,7 +922,7 @@ multiclass STORECRm<string opcStr, bits<8>opc, RegisterClass RC> {
                !strconcat(opcStr, " $sx, $sy, $sz")>;
 }
 
-let hasSideEffects = 1, Constraints = "$sx = $sx_in", DisableEncoding = "$sx_in" in
+let hasSideEffects = 1, Constraints = "$sx = $sx_in" in
 multiclass TSCRm<string opcStr, bits<8>opc, RegisterClass RC> {
   def rrr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sx_in),
                !strconcat(opcStr, " $sx, $sy, $sz")>;
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
index 327ad9ceacc5..e0989bf6ad23 100644
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -35,7 +35,7 @@ def STVM512rii : Pseudo<
 
 // LVM/SVM instructions using VM512
 let hasSideEffects = 0, isCodeGenOnly = 1 in {
-  let Constraints = "$vx = $vd", DisableEncoding = "$vd" in {
+  let Constraints = "$vx = $vd" in {
     def LVMyir_y : Pseudo<(outs VM512:$vx), (ins uimm3:$sy, I64:$sz, VM512:$vd),
                           "# pseudo LVM $vx, $sy, $sz, $vd">;
     def LVMyim_y : Pseudo<(outs VM512:$vx),
@@ -51,7 +51,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in {
 }
 
 // VFMK/VFMKW/VFMKS instructions using VM512
-let hasSideEffects = 0, isCodeGenOnly = 1, DisableEncoding = "$vl" in {
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
   def VFMKyal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
                        "# pseudo-vfmk.at $vmx">;
   def VFMKynal : Pseudo<(outs VM512:$vmx), (ins I32:$vl),
@@ -126,21 +126,18 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in {
 
 // Multiclass for VLD instructions
 let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
-multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in,
-                 string disEnc = ""> {
-  let DisableEncoding = disEnc in
+multiclass VLDbm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> {
   def "" : RVM<opc, (outs RC:$vx), dag_in,
                !strconcat(opcStr, " $vx, $sy, $sz")>;
-  let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
-      isCodeGenOnly = 1 in
+  let Constraints = "$vx = $base", isCodeGenOnly = 1 in
   def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
                !strconcat(opcStr, " $vx, $sy, $sz")>;
 }
 multiclass VLDlm<string opcStr, bits<8>opc, RegisterClass RC, dag dag_in> {
   defm "" : VLDbm<opcStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl)), "$vl,">;
-    defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl)), "$vl,">;
+    defm l : VLDbm<opcStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : VLDbm<opcStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 let VE_VLIndex = 3 in
@@ -182,7 +179,7 @@ let cx = 1 in defm VLDL2DZX : VLDm<"vldl2d.zx", 0xc3, V64>;
 let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
 multiclass VSTbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
   def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
-  let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+  let isCodeGenOnly = 1, VE_VLInUse = 1 in {
     def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
                 !strconcat(opcStr, argStr)>;
     def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -232,12 +229,10 @@ defm VSTL2D : VSTm<"vstl2d", 0xd3, V64>;
 // Multiclass for VGT instructions
 let mayLoad = 1, hasSideEffects = 0, Uses = [VL] in
 multiclass VGTbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
-                 dag dag_in, string disEnc = ""> {
-  let DisableEncoding = disEnc in
+                 dag dag_in> {
   def "" : RVM<opc, (outs RC:$vx), dag_in,
                !strconcat(opcStr, " $vx, ", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
-      isCodeGenOnly = 1 in
+  let Constraints = "$vx = $base", isCodeGenOnly = 1 in
   def _v : RVM<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
                !strconcat(opcStr, " $vx, ", argStr)>;
 }
@@ -245,10 +240,8 @@ multiclass VGTlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                  dag dag_in> {
   defm "" : VGTbm<opcStr, argStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
-                   "$vl,">;
-    defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
-                   "$vl,">;
+    defm l : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : VGTbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 multiclass VGTmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -297,7 +290,7 @@ def : MnemonicAlias<"vgtl.nc", "vgtl.zx.nc">;
 let mayStore = 1, hasSideEffects = 0, Uses = [VL] in
 multiclass VSCbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
   def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
-  let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+  let isCodeGenOnly = 1, VE_VLInUse = 1 in {
     def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
                 !strconcat(opcStr, argStr)>;
     def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -348,7 +341,7 @@ defm VSCL : VSCm<"vscl", 0xb3, V64>;
 let Uses = [VL] in
 multiclass PFCHVbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
   def "" : RVM<opc, (outs), dag_in, !strconcat(opcStr, argStr)>;
-  let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+  let isCodeGenOnly = 1, VE_VLInUse = 1 in {
     def l : RVM<opc, (outs), !con(dag_in, (ins I32:$vl)),
                 !strconcat(opcStr, argStr)>;
     def L : RVM<opc, (outs), !con(dag_in, (ins VLS:$vl)),
@@ -373,8 +366,7 @@ let sx = 0, vx = ?, hasSideEffects = 0 in
 multiclass LSVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                  dag dag_in> {
   def "" : RR<opc, (outs RC:$vx), dag_in, !strconcat(opcStr, " ${vx}", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = "$base",
-      isCodeGenOnly = 1 in
+  let Constraints = "$vx = $base", isCodeGenOnly = 1 in
   def _v : RR<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
                !strconcat(opcStr, " ${vx}", argStr)>;
 }
@@ -406,8 +398,7 @@ multiclass LVMbm<string opcStr, string argStr, bits<8>opc, RegisterClass RCM,
                  dag dag_in> {
   def "" : RR<opc, (outs RCM:$vx), dag_in,
               !strconcat(opcStr, " $vx, ", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = "$base",
-      isCodeGenOnly = 1 in {
+  let Constraints = "$vx = $base", isCodeGenOnly = 1 in {
     def _m : RR<opc, (outs RCM:$vx), !con(dag_in, (ins RCM:$base)),
                 !strconcat(opcStr, " $vx, ", argStr)>;
   }
@@ -440,11 +431,10 @@ defm SVM : SVMm<"svm", 0xa7, VM>;
 // Section 8.9.24 - VBRD (Vector Broadcast)
 let vx = ?, hasSideEffects = 0, Uses = [VL] in
 multiclass VBRDbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
-                  dag dag_in, string disEnc = ""> {
-  let DisableEncoding = disEnc in
+                  dag dag_in> {
   def "" : RV<opc, (outs RC:$vx), dag_in,
               !strconcat(opcStr, " $vx, ", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+  let Constraints = "$vx = $base",
       isCodeGenOnly = 1 in
   def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
               !strconcat(opcStr, " $vx, ", argStr)>;
@@ -453,10 +443,8 @@ multiclass VBRDlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                   dag dag_in> {
   defm "" : VBRDbm<opcStr, argStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
-                   "$vl,">;
-    defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
-                   "$vl,">;
+    defm l : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : VBRDbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 multiclass VBRDmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -484,11 +472,10 @@ defm PVBRD : VBRDm<"pvbrd", 0x8c, V64, I64, VM512>;
 // Section 8.9.25 - VMV (Vector Move)
 let vx = ?, vz = ?, hasSideEffects = 0, Uses = [VL] in
 multiclass VMVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
-                 dag dag_in, string disEnc = ""> {
-  let DisableEncoding = disEnc in
+                 dag dag_in> {
   def "" : RV<opc, (outs RC:$vx), dag_in,
               !strconcat(opcStr, " $vx, ", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
+  let Constraints = "$vx = $base",
       isCodeGenOnly = 1 in
   def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
               !strconcat(opcStr, " $vx, ", argStr)>;
@@ -497,10 +484,8 @@ multiclass VMVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                  dag dag_in> {
   defm "" : VMVbm<opcStr, argStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
-                   "$vl,">;
-    defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
-                   "$vl,">;
+    defm l : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : VMVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 multiclass VMVmm<string opcStr, bits<8>opc, RegisterClass RC,
@@ -525,12 +510,10 @@ defm VMV : VMVm<"vmv", 0x9c, V64, VM>;
 // Multiclass for generic vector calculation
 let vx = ?, hasSideEffects = 0, Uses = [VL] in
 multiclass RVbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
-                dag dag_in, string disEnc = ""> {
-  let DisableEncoding = disEnc in
+                dag dag_in> {
   def "" : RV<opc, (outs RC:$vx), dag_in,
               !strconcat(opcStr, " $vx", argStr)>;
-  let Constraints = "$vx = $base", DisableEncoding = disEnc#"$base",
-      isCodeGenOnly = 1 in
+  let Constraints = "$vx = $base", isCodeGenOnly = 1 in
   def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
               !strconcat(opcStr, " $vx", argStr)>;
 }
@@ -538,10 +521,8 @@ multiclass RVlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                 dag dag_in> {
   defm "" : RVbm<opcStr, argStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
-                  "$vl,">;
-    defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
-                  "$vl,">;
+    defm l : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : RVbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 multiclass RVmm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
@@ -688,11 +669,10 @@ multiclass RVFIXm<string opcStr, bits<8> opc, RegisterClass RC,
 // Multiclass for generic iterative vector calculation
 let vx = ?, hasSideEffects = 0, Uses = [VL] in
 multiclass RVIbm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
-                dag dag_in, string disEnc = ""> {
-  let DisableEncoding = disEnc in
+                dag dag_in> {
   def "" : RV<opc, (outs RC:$vx), dag_in,
               !strconcat(opcStr, " $vx", argStr)>;
-  let isCodeGenOnly = 1, Constraints = "$vx = $base", DisableEncoding = disEnc#"$base" in
+  let isCodeGenOnly = 1, Constraints = "$vx = $base" in
   def _v : RV<opc, (outs RC:$vx), !con(dag_in, (ins RC:$base)),
               !strconcat(opcStr, " $vx", argStr)>;
 }
@@ -700,10 +680,8 @@ multiclass RVIlm<string opcStr, string argStr, bits<8>opc, RegisterClass RC,
                  dag dag_in> {
   defm "" : RVIbm<opcStr, argStr, opc, RC, dag_in>;
   let isCodeGenOnly = 1, VE_VLInUse = 1 in {
-    defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl)),
-                   "$vl,">;
-    defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl)),
-                   "$vl,">;
+    defm l : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins I32:$vl))>;
+    defm L : RVIbm<opcStr, argStr, opc, RC, !con(dag_in, (ins VLS:$vl))>;
   }
 }
 // Generic RV multiclass for iterative operation with 2 argument.
@@ -743,7 +721,7 @@ let vx = ?, hasSideEffects = 0, Uses = [VL] in
 multiclass RVMKbm<string opcStr, string argStr, bits<8>opc, dag dag_out,
                   dag dag_in> {
   def "" : RV<opc, dag_out, dag_in, !strconcat(opcStr, argStr)>;
-  let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+  let isCodeGenOnly = 1, VE_VLInUse = 1 in {
     def l : RV<opc, dag_out, !con(dag_in, (ins I32:$vl)),
                !strconcat(opcStr, argStr)>;
     def L : RV<opc, dag_out, !con(dag_in, (ins VLS:$vl)),
@@ -796,7 +774,7 @@ multiclass RVMSbm<string opcStr, string argStr, bits<8>opc, dag dag_in> {
     bits<7> sx;
     let Inst{54-48} = sx;
   }
-  let DisableEncoding = "$vl", isCodeGenOnly = 1, VE_VLInUse = 1 in {
+  let isCodeGenOnly = 1, VE_VLInUse = 1 in {
     def l : RV<opc, (outs I64:$sx), !con(dag_in, (ins I32:$vl)),
                !strconcat(opcStr, " $sx,", argStr)> {
       bits<7> sx;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 378ef2c8f250..1eae3586d16b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -27,6 +27,7 @@ HANDLE_NODETYPE(WrapperREL)
 HANDLE_NODETYPE(BR_IF)
 HANDLE_NODETYPE(BR_TABLE)
 HANDLE_NODETYPE(DOT)
+HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U)
 HANDLE_NODETYPE(SHUFFLE)
 HANDLE_NODETYPE(SWIZZLE)
 HANDLE_NODETYPE(VEC_SHL)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index c6c2d0cfccb6..fe100dab427e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2183,13 +2183,10 @@ SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) {
     SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS);
     SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS);
 
-    SDValue LowLow = DAG.getNode(LowOpc, DL, MVT::v4i32, MulLow);
-    SDValue LowHigh = DAG.getNode(LowOpc, DL, MVT::v4i32, MulHigh);
-    SDValue HighLow = DAG.getNode(HighOpc, DL, MVT::v4i32, MulLow);
-    SDValue HighHigh = DAG.getNode(HighOpc, DL, MVT::v4i32, MulHigh);
-
-    SDValue AddLow = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowLow, HighLow);
-    SDValue AddHigh = DAG.getNode(ISD::ADD, DL, MVT::v4i32, LowHigh, HighHigh);
+    SDValue AddLow =
+        DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, MVT::v4i32, MulLow);
+    SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL,
+                                  MVT::v4i32, MulHigh);
     SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh);
     return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add);
   }
@@ -3588,34 +3585,53 @@ static SDValue performMulCombine(SDNode *N,
   if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
     return Res;
 
-  // We don't natively support v16i8 mul, but we do support v8i16 so split the
-  // inputs and extend them to v8i16. Only do this before legalization in case
-  // a narrow vector is widened and may be simplified later.
-  if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+  // We don't natively support v16i8 or v8i8 mul, but we do support v8i16. So,
+  // extend them to v8i16. Only do this before legalization in case a narrow
+  // vector is widened and may be simplified later.
+  if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8))
     return SDValue();
 
   SDLoc DL(N);
   SelectionDAG &DAG = DCI.DAG;
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
-  SDValue LowLHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
-  SDValue HighLHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
-  SDValue LowRHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
-  SDValue HighRHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
-
-  SDValue MulLow =
-      DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
-  SDValue MulHigh = DAG.getBitcast(
-      VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
-
-  // Take the low byte of each lane.
-  return DAG.getVectorShuffle(
-      VT, DL, MulLow, MulHigh,
-      {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+  EVT MulVT = MVT::v8i16;
+
+  if (VT == MVT::v8i8) {
+    SDValue PromotedLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, LHS,
+                                      DAG.getUNDEF(MVT::v8i8));
+    SDValue PromotedRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, RHS,
+                                      DAG.getUNDEF(MVT::v8i8));
+    SDValue LowLHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedLHS);
+    SDValue LowRHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedRHS);
+    SDValue MulLow = DAG.getBitcast(
+        MVT::v16i8, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+    // Take the low byte of each lane.
+    SDValue Shuffle = DAG.getVectorShuffle(
+        MVT::v16i8, DL, MulLow, DAG.getUNDEF(MVT::v16i8),
+        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+    return extractSubVector(Shuffle, 0, DAG, DL, 64);
+  } else {
+    assert(VT == MVT::v16i8 && "Expected v16i8");
+    SDValue LowLHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, LHS);
+    SDValue LowRHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, RHS);
+    SDValue HighLHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, LHS);
+    SDValue HighRHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, RHS);
+
+    SDValue MulLow =
+        DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+    SDValue MulHigh =
+        DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, HighLHS, HighRHS));
+
+    // Take the low byte of each lane.
+    return DAG.getVectorShuffle(
+        VT, DL, MulLow, MulHigh,
+        {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+  }
 }
 
 SDValue
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index a934853ff9f4..feac04a17068 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
 #include "WebAssemblyGenInstrInfo.inc"
 
 WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
-    : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+    : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN,
                               WebAssembly::ADJCALLSTACKUP,
                               WebAssembly::CATCHRET),
       RI(STI.getTargetTriple()) {}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index f06f8d5174e3..3c26b453c448 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1453,15 +1453,22 @@ if !ne(t1, t2) then
 def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>;
 
 // Extended pairwise addition
+def extadd_pairwise_u : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_U", extend_t>;
+
 defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_signed,
                       "extadd_pairwise_i8x16_s", 0x7c>;
-defm "" : SIMDConvert<I16x8, I8x16, int_wasm_extadd_pairwise_unsigned,
+defm "" : SIMDConvert<I16x8, I8x16, extadd_pairwise_u,
                       "extadd_pairwise_i8x16_u", 0x7d>;
 defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_signed,
                       "extadd_pairwise_i16x8_s", 0x7e>;
-defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned,
+defm "" : SIMDConvert<I32x4, I16x8, extadd_pairwise_u,
                       "extadd_pairwise_i16x8_u", 0x7f>;
 
+def : Pat<(v4i32 (int_wasm_extadd_pairwise_unsigned (v8i16 V128:$in))),
+          (extadd_pairwise_u_I32x4 V128:$in)>;
+def : Pat<(v8i16 (int_wasm_extadd_pairwise_unsigned (v16i8 V128:$in))),
+          (extadd_pairwise_u_I16x8 V128:$in)>;
+
 // f64x2 <-> f32x4 conversions
 def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index bc91c6424b63..08ca20b5eef6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -247,7 +247,8 @@ static void query(const MachineInstr &MI, bool &Read, bool &Write,
   // Check for writes to __stack_pointer global.
   if ((MI.getOpcode() == WebAssembly::GLOBAL_SET_I32 ||
        MI.getOpcode() == WebAssembly::GLOBAL_SET_I64) &&
-      strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer") == 0)
+      MI.getOperand(0).isSymbol() &&
+      !strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer"))
     StackPointer = true;
 
   // Analyze calls.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 08fb7586d215..0eefd3e2b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
                                   CostKind);
   }
 
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  if (ISD != ISD::LOAD) {
-    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
-                                  CostKind);
-  }
-
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
   if (VT == MVT::Other)
@@ -182,22 +176,130 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
-  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
-  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
-  // are twice as expensive as scalar.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
   unsigned width = VT.getSizeInBits();
-  switch (width) {
-  default:
-    break;
-  case 32:
-  case 64:
-  case 128:
-    return 2;
+  if (ISD == ISD::LOAD) {
+    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+    // loads are twice as expensive as scalar.
+    switch (width) {
+    default:
+      break;
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
+  } else if (ISD == ISD::STORE) {
+    // For stores, we can use store lane operations.
+    switch (width) {
+    default:
+      break;
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
   }
 
   return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
 }
 
+/// Cost of an interleaved access group of \p Factor members over the wide
+/// vector \p Ty. Returns an invalid cost without SIMD128, for non-fixed
+/// vectors, and for unsupported element counts or element sizes. Masked
+/// groups, factors above four, and sub-vector types with no shuffle-table
+/// entry are deferred to BaseT.
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) const {
+  assert(Factor >= 2 && "Invalid interleave factor");
+
+  auto *VecTy = cast<VectorType>(Ty);
+  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy))
+    return InstructionCost::getInvalid();
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
+  constexpr unsigned MaxInterleaveFactor = 4;
+  if (Factor <= MaxInterleaveFactor) {
+    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+    // Ensure there are at least two elements and that they divide evenly
+    // among the Factor members.
+    if (MinElts < 2 || MinElts % Factor != 0)
+      return InstructionCost::getInvalid();
+
+    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+    // Ensure the element type is legal.
+    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+      return InstructionCost::getInvalid();
+
+    auto *SubVecTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().divideCoefficientBy(Factor));
+    InstructionCost MemCost =
+        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+    unsigned MaxVecSize = 128;
+    unsigned NumAccesses =
+        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+    // A stride of two is commonly supported via dedicated instructions, so it
+    // should be relatively cheap for all element sizes. A stride of four is
+    // more expensive as it will likely require more shuffles. Using two
+    // simd128 inputs is considered more expensive and we mainly account for
+    // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
+    // arithmetic kernels.
+    static const CostTblEntry ShuffleCostTbl[] = {
+        // One reg.
+        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
+        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
+        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
+        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+        // Two regs.
+        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+        // One reg.
+        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
+        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
+        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+        // Two regs.
+        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+        // Four regs.
+        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
+    };
+
+    // getSimpleVT() asserts on extended value types, and the table only holds
+    // simple ones; anything else takes the generic fallback below.
+    EVT ETy = TLI->getValueType(DL, SubVecTy);
+    if (ETy.isSimple()) {
+      if (const auto *Entry =
+              CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+        return Entry->Cost + (NumAccesses * MemCost);
+    }
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace, CostKind,
+                                           UseMaskForCond, UseMaskForGaps);
+}
+
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     const Value *Op0, const Value *Op1) const {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index c915eeb07d4f..2573066cd5d6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -82,6 +82,10 @@ public:
       TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
       const Instruction *I = nullptr) const override;
+  InstructionCost getInterleavedMemoryOpCost(
+      unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+      bool UseMaskForCond, bool UseMaskForGaps) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d7671ed19589..ce5e92135f70 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -15,10 +15,12 @@
 #include "MCTargetDesc/X86TargetStreamer.h"
 #include "TargetInfo/X86TargetInfo.h"
 #include "X86Operand.h"
+#include "X86RegisterInfo.h"
 #include "llvm-c/Visibility.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCContext.h"
@@ -29,6 +31,7 @@
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegister.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
@@ -40,6 +43,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cstdint>
 #include <memory>
 
 using namespace llvm;
@@ -1172,7 +1176,7 @@ private:
 
   X86::CondCode ParseConditionCode(StringRef CCode);
 
-  bool ParseIntelMemoryOperandSize(unsigned &Size);
+  bool ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr);
   bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp,
                                MCRegister BaseReg, MCRegister IndexReg,
                                unsigned Scale, bool NonAbsMem, SMLoc Start,
@@ -2574,7 +2578,8 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
   return false;
 }
 
-bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
+bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
+                                               StringRef *SizeStr) {
   Size = StringSwitch<unsigned>(getTok().getString())
     .Cases("BYTE", "byte", 8)
     .Cases("WORD", "word", 16)
@@ -2592,6 +2597,8 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
     .Cases("ZMMWORD", "zmmword", 512)
     .Default(0);
   if (Size) {
+    if (SizeStr)
+      *SizeStr = getTok().getString();
     const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
     if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr"))
       return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
@@ -2600,6 +2607,19 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
   return false;
 }
 
+/// Width in bits of \p RegNo if it is in GR8/16/32/64, else 0 (unknown size).
+static uint16_t RegSizeInBits(const MCRegisterInfo &MRI, MCRegister RegNo) {
+  if (X86MCRegisterClasses[X86::GR8RegClassID].contains(RegNo))
+    return 8;
+  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(RegNo))
+    return 16;
+  if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo))
+    return 32;
+  if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+    return 64;
+  return 0;
+}
+
 bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
@@ -2607,7 +2627,8 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
 
   // Parse optional Size directive.
   unsigned Size;
-  if (ParseIntelMemoryOperandSize(Size))
+  StringRef SizeStr;
+  if (ParseIntelMemoryOperandSize(Size, &SizeStr))
     return true;
   bool PtrInOperand = bool(Size);
 
@@ -2624,9 +2645,29 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
       return Error(Start, "rip can only be used as a base register");
     // A Register followed by ':' is considered a segment override
     if (Tok.isNot(AsmToken::Colon)) {
-      if (PtrInOperand)
-        return Error(Start, "expected memory operand after 'ptr', "
-                            "found register operand instead");
+      if (PtrInOperand) {
+        if (!Parser.isParsingMasm())
+          return Error(Start, "expected memory operand after 'ptr', "
+                              "found register operand instead");
+
+        // If we are parsing MASM, we are allowed to cast registers to their own
+        // sizes, but not to other types.
+        uint16_t RegSize =
+            RegSizeInBits(*getContext().getRegisterInfo(), RegNo);
+        if (RegSize == 0)
+          return Error(
+              Start,
+              "cannot cast register '" +
+                  StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+                  "'; its size is not easily defined.");
+        if (RegSize != Size)
+          return Error(
+              Start,
+              std::to_string(RegSize) + "-bit register '" +
+                  StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+                  "' cannot be used as a " + std::to_string(Size) + "-bit " +
+                  SizeStr.upper());
+      }
       Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
       return false;
     }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 56a4cc3d65c2..865fc0ce8101 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
   if (!CanPadInst)
     return;
 
-  if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
+  if (PendingBA) {
+    auto *NextFragment = PendingBA->getNext();
+    assert(NextFragment && "NextFragment should not be null");
+    if (NextFragment == OS.getCurrentFragment())
+      return;
+    // We eagerly create an empty fragment when inserting a fragment
+    // with a variable-size tail.
+    if (NextFragment->getNext() == OS.getCurrentFragment())
+      return;
+
     // Macro fusion actually happens and there is no other fragment inserted
     // after the previous instruction.
     //
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 547745fdba9d..76731437931a 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1668,6 +1668,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
 
+  case X86::VMOVSHZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DecodeScalarMoveMask(8, false, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
   case X86::MOVPQI2QIrr:
   case X86::MOVZPQILo2PQIrr:
   case X86::VMOVPQI2QIrr:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a15930c1433f..cfe5b1094811 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1047,9 +1047,6 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
 
   Prefix.setL(TSFlags & X86II::VEX_L);
   Prefix.setL2(TSFlags & X86II::EVEX_L2);
-  if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) &&
-      !STI.hasFeature(X86::FeatureEVEX512))
-    report_fatal_error("ZMM registers are not supported without EVEX512");
   switch (TSFlags & X86II::OpPrefixMask) {
   case X86II::PD:
     Prefix.setPP(0x1); // 66
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index cc7bcd678cb3..bb1e716c33ed 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -397,18 +397,6 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
   if (CPU.empty())
     CPU = "generic";
 
-  size_t posNoEVEX512 = FS.rfind("-evex512");
-  // Make sure we won't be cheated by "-avx512fp16".
-  size_t posNoAVX512F =
-      FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
-  size_t posEVEX512 = FS.rfind("+evex512");
-  size_t posAVX512F = FS.rfind("+avx512"); // Any AVX512XXX will enable AVX512F.
-
-  if (posAVX512F != StringRef::npos &&
-      (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
-    if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
-      ArchFS += ",+evex512";
-
   return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
 }
 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9cfe081b8710..7c9e821c02fd 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -113,6 +113,7 @@ def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
 def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
                        "Support 16-bit floating point conversion instructions",
                        [FeatureAVX]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
 def FeatureEVEX512  : SubtargetFeature<"evex512", "HasEVEX512", "true",
                         "Support ZMM and 64-bit mask instructions">;
 def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
@@ -329,20 +330,22 @@ def FeatureMOVDIRI  : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
                                        "Support movdiri instruction (direct store integer)">;
 def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
                                         "Support movdir64b instruction (direct store 64 bytes)">;
-def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
-                                      "Support AVX10.1 up to 256-bit instruction",
+def FeatureAVX10_1 : SubtargetFeature<"avx10.1", "HasAVX10_1", "true",
+                                      "Support AVX10.1 instruction",
                                       [FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
                                        FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
                                        FeatureFP16, FeatureVLX, FeatureDQI]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
 def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
-                                          "Support AVX10.1 up to 512-bit instruction",
-                                          [FeatureAVX10_1, FeatureEVEX512]>;
-def FeatureAVX10_2 : SubtargetFeature<"avx10.2-256", "HasAVX10_2", "true",
-                                      "Support AVX10.2 up to 256-bit instruction",
+                                          "Support AVX10.1 instruction",
+                                          [FeatureAVX10_1]>;
+def FeatureAVX10_2 : SubtargetFeature<"avx10.2", "HasAVX10_2", "true",
+                                      "Support AVX10.2 instruction",
                                       [FeatureAVX10_1]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
 def FeatureAVX10_2_512 : SubtargetFeature<"avx10.2-512", "HasAVX10_2_512", "true",
-                                          "Support AVX10.2 up to 512-bit instruction",
-                                          [FeatureAVX10_2, FeatureAVX10_1_512]>;
+                                          "Support AVX10.2 instruction",
+                                          [FeatureAVX10_2]>;
 def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
                                    "Support extended general purpose register">;
 def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
@@ -871,7 +874,6 @@ def ProcessorFeatures {
   ];
 
   list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
-    FeatureEVEX512,
     FeatureBWI,
     FeatureCDI,
     FeatureDQI,
@@ -996,7 +998,6 @@ def ProcessorFeatures {
                                                   FeatureXSAVES,
                                                   FeatureCLFLUSHOPT,
                                                   FeatureAVX512,
-                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
@@ -1039,7 +1040,6 @@ def ProcessorFeatures {
 
   // Cannonlake
   list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
-                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
@@ -1155,7 +1155,7 @@ def ProcessorFeatures {
     !listconcat(GNRFeatures, GNRDAdditionalFeatures);
 
   // Diamond Rapids
-  list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2_512,
+  list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2,
                                                   FeatureSM4,
                                                   FeatureCMPCCXADD,
                                                   FeatureAVXIFMA,
@@ -1368,7 +1368,6 @@ def ProcessorFeatures {
                                         FeatureF16C,
                                         FeatureFSGSBase,
                                         FeatureAVX512,
-                                        FeatureEVEX512,
                                         FeatureCDI,
                                         FeatureADX,
                                         FeatureRDSEED,
@@ -1586,7 +1585,6 @@ def ProcessorFeatures {
   list<SubtargetFeature> ZN4Tuning =
     !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
-                                                  FeatureEVEX512,
                                                   FeatureCDI,
                                                   FeatureDQI,
                                                   FeatureBWI,
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index d406277e440b..ff22ee8c86fa 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -476,7 +476,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
   return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ ||
          Opc == X86::TAILJMPr || Opc == X86::TAILJMPm ||
          Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
-         Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
+         Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+         Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi ||
          Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
          Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX ||
          Opc == X86::TAILJMPm64_REX;
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 0e6b4dffec3a..9457e718de69 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -269,6 +269,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case X86::TCRETURNdi:
   case X86::TCRETURNdicc:
   case X86::TCRETURNri:
+  case X86::TCRETURN_WIN64ri:
+  case X86::TCRETURN_HIPE32ri:
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNdi64cc:
@@ -346,8 +348,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
       MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
       for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
         MIB.add(MBBI->getOperand(i));
-    } else if ((Opcode == X86::TCRETURNri64) ||
-               (Opcode == X86::TCRETURNri64_ImpCall)) {
+    } else if (Opcode == X86::TCRETURNri64 ||
+               Opcode == X86::TCRETURNri64_ImpCall ||
+               Opcode == X86::TCRETURN_WIN64ri) {
       JumpTarget.setIsKill();
       BuildMI(MBB, MBBI, DL,
               TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d3c239250943..787b71d425cb 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
       MachineBasicBlock::iterator I;
       if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
         I = ++LastShapeMI->getIterator();
-      else
-        I = ++MI.getIterator();
+      else {
+        // A call can overwrite registers like rax, so ensure the tile config
+        // instruction is sunk closer to the first instruction that uses a tile.
+        auto UseIt = MI.getIterator();
+        while (UseIt != MBB.end()) {
+          if (HasTileOperand(MRI, *UseIt))
+            break;
+          ++UseIt;
+        }
+        I = UseIt;
+      }
       Config(*I);
       HasUnconfigTile = false;
       continue;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index cba7843d53e3..a293b4c87cfe 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2398,7 +2398,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
 }
 
 static bool isTailCallOpcode(unsigned Opc) {
-  return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+  return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+         Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi ||
          Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
          Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
          Opc == X86::TCRETURNmi64;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19131fbd4102..3631016b0f5c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -326,15 +326,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.hasAVX10_2()) {
     setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom);
     setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
+    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
     for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
                    MVT::v4i64}) {
       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal);
       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal);
     }
-    if (Subtarget.hasAVX10_2_512()) {
-      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
-      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
-    }
     if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal);
       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal);
@@ -2457,6 +2455,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
+    setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
+    setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
+    setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
+    setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
+    setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
+    setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
+    setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
+    setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
+    setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+    setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
     for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
       setOperationAction(ISD::FADD, VT, Legal);
       setOperationAction(ISD::FSUB, VT, Legal);
@@ -2470,19 +2479,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
       setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
     }
-    if (Subtarget.hasAVX10_2_512()) {
-      setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
-      setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
-      setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
-      setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
-      setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
-      setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
-      setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
-      setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
-      setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
-      setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
-      setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
-    }
     for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
       setCondCodeAction(ISD::SETOEQ, VT, Custom);
       setCondCodeAction(ISD::SETUNE, VT, Custom);
@@ -21252,7 +21248,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   // the truncation then we can use PACKSS by converting the srl to a sra.
   // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
   if (In.getOpcode() == ISD::SRL && In->hasOneUse())
-    if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
+    if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
       if (*ShAmt == MinSignBits) {
         PackOpcode = X86ISD::PACKSS;
         return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
@@ -26269,10 +26265,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-
-  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
-    if (MaskConst->getZExtValue() & 0x1)
-      return Op;
+  auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+    return Op;
 
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -26288,6 +26283,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
 
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+  if (MaskConst) {
+    assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+    // Discard op and blend passthrough with scalar op src/dst.
+    SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+    std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+    ShuffleMask[0] = VT.getVectorNumElements();
+    return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+                                ShuffleMask);
+  }
+
   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
 }
 
@@ -31404,9 +31410,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     return R;
 
   // AVX512 implicitly uses modulo rotation amounts.
-  if ((Subtarget.hasVLX() ||
-       (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
-      32 <= EltSizeInBits) {
+  if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
     // Attempt to rotate by immediate.
     if (IsCstSplat) {
       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
@@ -38676,13 +38680,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 
     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     if (Opc == X86ISD::VSHLI) {
-      Known.Zero <<= ShAmt;
-      Known.One <<= ShAmt;
+      Known <<= ShAmt;
       // Low bits are known zero.
       Known.Zero.setLowBits(ShAmt);
     } else if (Opc == X86ISD::VSRLI) {
-      Known.Zero.lshrInPlace(ShAmt);
-      Known.One.lshrInPlace(ShAmt);
+      Known >>= ShAmt;
       // High bits are known zero.
       Known.Zero.setHighBits(ShAmt);
     } else {
@@ -44518,8 +44520,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
                              TLO, Depth + 1))
       return true;
 
-    Known.Zero <<= ShAmt;
-    Known.One <<= ShAmt;
+    Known <<= ShAmt;
 
     // Low bits known zero.
     Known.Zero.setLowBits(ShAmt);
@@ -44549,8 +44550,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
                              TLO, Depth + 1))
       return true;
 
-    Known.Zero.lshrInPlace(ShAmt);
-    Known.One.lshrInPlace(ShAmt);
+    Known >>= ShAmt;
 
     // High bits known zero.
     Known.Zero.setHighBits(ShAmt);
@@ -44598,8 +44598,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
                              TLO, Depth + 1))
       return true;
 
-    Known.Zero.lshrInPlace(ShAmt);
-    Known.One.lshrInPlace(ShAmt);
+    Known >>= ShAmt;
 
     // If the input sign bit is known to be zero, or if none of the top bits
     // are demanded, turn this into an unsigned shift right.
@@ -44957,6 +44956,44 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
     return false;
   }
+  case X86ISD::VPMADD52L:
+  case X86ISD::VPMADD52H: {
+    KnownBits KnownOp0, KnownOp1, KnownOp2;
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+    SDValue Op2 = Op.getOperand(2);
+    // Only demand the lower 52 bits of operands 0 / 1 (and all 64 bits of
+    // operand 2).
+    APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
+    if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
+                             TLO, Depth + 1))
+      return true;
+
+    if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
+                             TLO, Depth + 1))
+      return true;
+
+    if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
+                             KnownOp2, TLO, Depth + 1))
+      return true;
+
+    KnownBits KnownMul;
+    KnownOp0 = KnownOp0.trunc(52);
+    KnownOp1 = KnownOp1.trunc(52);
+    KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
+                                        : KnownBits::mulhu(KnownOp0, KnownOp1);
+    KnownMul = KnownMul.zext(64);
+
+    // lo/hi(X * Y) + Z --> C + Z
+    if (KnownMul.isConstant()) {
+      SDLoc DL(Op);
+      SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
+    }
+
+    Known = KnownBits::add(KnownMul, KnownOp2);
+    return false;
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -45132,6 +45169,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
 
   switch (Op.getOpcode()) {
+  // SSE bit logic.
+  case X86ISD::FAND:
+  case X86ISD::FOR:
+  case X86ISD::FXOR:
+  case X86ISD::FANDN:
+  case X86ISD::ANDNP:
+  case X86ISD::VPTERNLOG:
+    return false;
   // SSE vector insert/extracts use modulo indices.
   case X86ISD::PINSRB:
   case X86ISD::PINSRW:
@@ -45167,6 +45212,11 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   // SSE signbit extraction.
   case X86ISD::MOVMSK:
     return false;
+  // GFNI instructions.
+  case X86ISD::GF2P8AFFINEINVQB:
+  case X86ISD::GF2P8AFFINEQB:
+  case X86ISD::GF2P8MULB:
+    return false;
   case ISD::INTRINSIC_WO_CHAIN:
     switch (Op->getConstantOperandVal(0)) {
     case Intrinsic::x86_sse2_pmadd_wd:
@@ -48349,7 +48399,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
   // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
   // peek through and adjust the TEST bit.
   if (Src.getOpcode() == ISD::SHL) {
-    if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
+    if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
       Src = Src.getOperand(0);
       BitMask.lshrInPlace(*ShiftAmt);
     }
@@ -50886,10 +50936,12 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 // Given a target type \p VT, we generate
 //   or (and x, y), (xor z, zext(build_vector (constants)))
 // given x, y and z are of type \p VT. We can do so, if operands are either
-// truncates from VT types, the second operand is a vector of constants or can
-// be recursively promoted.
+// truncates from VT types, the second operand is a vector of constants, can
+// be recursively promoted or is an existing extension we can extend further.
 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
-                                     SelectionDAG &DAG, unsigned Depth) {
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     unsigned Depth) {
   // Limit recursion to avoid excessive compile times.
   if (Depth >= SelectionDAG::MaxRecursionDepth)
     return SDValue();
@@ -50904,28 +50956,32 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
   if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
     return SDValue();
 
-  if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
+  if (SDValue NN0 =
+          PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
     N0 = NN0;
   else {
-    // The left side has to be a trunc.
-    if (N0.getOpcode() != ISD::TRUNCATE)
-      return SDValue();
-
-    // The type of the truncated inputs.
-    if (N0.getOperand(0).getValueType() != VT)
+    // The left side has to be a 'trunc'.
+    bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
+                    N0.getOperand(0).getValueType() == VT;
+    if (LHSTrunc)
+      N0 = N0.getOperand(0);
+    else
       return SDValue();
-
-    N0 = N0.getOperand(0);
   }
 
-  if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
+  if (SDValue NN1 =
+          PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
     N1 = NN1;
   else {
-    // The right side has to be a 'trunc' or a (foldable) constant.
+    // The right side has to be a 'trunc', a (foldable) constant or an
+    // existing extension we can extend further.
     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                     N1.getOperand(0).getValueType() == VT;
     if (RHSTrunc)
       N1 = N1.getOperand(0);
+    else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
+             Subtarget.hasInt256() && N1.hasOneUse())
+      N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
     else if (SDValue Cst =
                  DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
       N1 = Cst;
@@ -50955,7 +51011,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
   EVT NarrowVT = Narrow.getValueType();
 
   // Generate the wide operation.
-  SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
+  SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
   if (!Op)
     return SDValue();
   switch (N.getOpcode()) {
@@ -51804,6 +51860,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
     SDValue X, Y;
     EVT CondVT = VT.changeVectorElementType(MVT::i1);
     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+        (VT.is512BitVector() || Subtarget.hasVLX()) &&
+        (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
         sd_match(N, m_And(m_Value(X),
                           m_OneUse(m_SExt(m_AllOf(
                               m_Value(Y), m_SpecificVT(CondVT),
@@ -54135,10 +54193,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
                                           const SDLoc &DL) {
   assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
-  std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
+  std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
   if (!ValidSrlConst)
     return SDValue();
-  uint64_t SrlConstVal = *ValidSrlConst;
+  unsigned SrlConstVal = *ValidSrlConst;
 
   SDValue Op = N.getOperand(0);
   unsigned Opcode = Op.getOpcode();
@@ -55368,6 +55426,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     SDValue Src = N0.getOperand(0);
     EVT SrcVT = Src.getValueType();
     if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
+        (VT.is512BitVector() || Subtarget.hasVLX()) &&
+        (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
         TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
       return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
                            getZeroVector(VT, Subtarget, DAG, DL));
@@ -56247,7 +56307,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
 
   SDValue Masked = BroadcastOp;
   if (N != 0) {
-    APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len);
+    unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
+    unsigned NumDefinedElts = UndefElts.countTrailingZeros();
+
+    if (NumDefinedElts > BroadcastOpBitWidth)
+      return SDValue();
+
+    APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
     SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
                                        DAG.getConstant(N, DL, BroadcastOpVT));
     Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
@@ -57904,6 +57970,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
                      Cmov.getOperand(3));
 }
 
+// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L(x, y, acc)
+// when the upper 12 bits of x, y and MUL(x, y) are all known to be 0.
+static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
+                             EVT VT, const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+  if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
+      (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
+    return SDValue();
+
+  // Without AVX-IFMA, vectors narrower than 512 bits also require AVX-512VL.
+  if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
+      VT.getSizeInBits() < 512)
+    return SDValue();
+
+  const auto TotalSize = VT.getSizeInBits();
+  if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
+    return SDValue();
+
+  SDValue X, Y, Acc;
+  if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
+    return SDValue();
+
+  KnownBits KnownX = DAG.computeKnownBits(X);
+  if (KnownX.countMinLeadingZeros() < 12)
+    return SDValue();
+  KnownBits KnownY = DAG.computeKnownBits(Y);
+  if (KnownY.countMinLeadingZeros() < 12)
+    return SDValue();
+  KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
+  if (KnownMul.countMinLeadingZeros() < 12)
+    return SDValue();
+
+  auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
+                            ArrayRef<SDValue> SubOps) {
+    EVT SubVT = SubOps[0].getValueType();
+    assert(SubVT.getScalarSizeInBits() == 64 &&
+           "Unexpected element size, only supports 64bit size");
+    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
+                     SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
+  };
+
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
+                          /*CheckBWI*/ false);
+}
+
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -58007,6 +58118,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                        Op0.getOperand(0), Op0.getOperand(2));
   }
 
+  if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
+    return IFMA52;
+
   return combineAddOrSubToADCOrSBB(N, DL, DAG);
 }
 
@@ -60068,6 +60182,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Try to simplify VPMADD52L/VPMADD52H nodes via SimplifyDemandedBits.
+static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumEltBits = VT.getScalarSizeInBits();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
+                               DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
@@ -60705,6 +60832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
   case X86ISD::VPMADDUBSW:
   case X86ISD::VPMADDWD:    return combineVPMADD(N, DAG, DCI);
+  case X86ISD::VPMADD52L:
+  case X86ISD::VPMADD52H:    return combineVPMADD52LH(N, DAG, DCI);
   case X86ISD::KSHIFTL:
   case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
   case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
@@ -60932,117 +61061,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
 //                           X86 Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-// Helper to match a string separated by whitespace.
-static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
-  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
-
-  for (StringRef Piece : Pieces) {
-    if (!S.starts_with(Piece)) // Check if the piece matches.
-      return false;
-
-    S = S.substr(Piece.size());
-    StringRef::size_type Pos = S.find_first_not_of(" \t");
-    if (Pos == 0) // We matched a prefix.
-      return false;
-
-    S = S.substr(Pos);
-  }
-
-  return S.empty();
-}
-
-static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
-
-  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
-    if (llvm::is_contained(AsmPieces, "~{cc}") &&
-        llvm::is_contained(AsmPieces, "~{flags}") &&
-        llvm::is_contained(AsmPieces, "~{fpsr}")) {
-
-      if (AsmPieces.size() == 3)
-        return true;
-      else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
-        return true;
-    }
-  }
-  return false;
-}
-
-bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
-  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
-
-  StringRef AsmStr = IA->getAsmString();
-
-  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-  if (!Ty || Ty->getBitWidth() % 16 != 0)
-    return false;
-
-  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
-  SmallVector<StringRef, 4> AsmPieces;
-  SplitString(AsmStr, AsmPieces, ";\n");
-
-  switch (AsmPieces.size()) {
-  default: return false;
-  case 1:
-    // FIXME: this should verify that we are targeting a 486 or better.  If not,
-    // we will turn this bswap into something that will be lowered to logical
-    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
-    // lower so don't worry about this.
-    // bswap $0
-    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
-        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
-        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
-        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
-        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
-        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
-      // No need to check constraints, nothing other than the equivalent of
-      // "=r,0" would be valid here.
-      return IntrinsicLowering::LowerToByteSwap(CI);
-    }
-
-    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
-    if (CI->getType()->isIntegerTy(16) &&
-        IA->getConstraintString().starts_with("=r,0,") &&
-        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
-         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
-      AsmPieces.clear();
-      StringRef ConstraintsStr = IA->getConstraintString();
-      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
-      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (clobbersFlagRegisters(AsmPieces))
-        return IntrinsicLowering::LowerToByteSwap(CI);
-    }
-    break;
-  case 3:
-    if (CI->getType()->isIntegerTy(32) &&
-        IA->getConstraintString().starts_with("=r,0,") &&
-        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
-        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
-        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
-      AsmPieces.clear();
-      StringRef ConstraintsStr = IA->getConstraintString();
-      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
-      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (clobbersFlagRegisters(AsmPieces))
-        return IntrinsicLowering::LowerToByteSwap(CI);
-    }
-
-    if (CI->getType()->isIntegerTy(64)) {
-      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
-      if (Constraints.size() >= 2 &&
-          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
-          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
-        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
-        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
-            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
-            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
-          return IntrinsicLowering::LowerToByteSwap(CI);
-      }
-    }
-    break;
-  }
-  return false;
-}
-
 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
                            .Case("{@cca}", X86::COND_A)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 97d3b6e2420d..0c9ba591b03e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1364,8 +1364,6 @@ namespace llvm {
 
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
-    bool ExpandInlineAsm(CallInst *CI) const override;
-
     ConstraintType getConstraintType(StringRef Constraint) const override;
 
     /// Examine constraint string and operand type and determine a weight value.
@@ -1668,8 +1666,8 @@ namespace llvm {
     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
     bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                               ShuffleVectorInst *SVI,
-                               unsigned Factor) const override;
+                               ShuffleVectorInst *SVI, unsigned Factor,
+                               const APInt &GapMask) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
                                    int JTI, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 1c745a338a61..3bc46af4d130 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -302,7 +302,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
     if (Op.size() >= 16 &&
         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
       // FIXME: Check if unaligned 64-byte accesses are slow.
-      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
+      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
           (Subtarget.getPreferVectorWidth() >= 512)) {
         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
       }
@@ -416,7 +416,7 @@ bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
         return true;
       return false;
     case 512:
-      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
+      if (Subtarget.hasAVX512())
         return true;
       return false;
     default:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 1beaaafb159e..69a5115201ef 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
 } // HasAMXMOVRS, In64BitMode
 
 multiclass m_tcvtrowd2ps {
-  let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+  let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
     let SchedRW = [WriteSystem] in {
       def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
                     (ins TILE:$src1, i32u8imm:$src2),
@@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps {
                   "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   []>, T8,XS, EVEX, VVVV, EVEX_V512;
     }
-  } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+  } // HasAMXAVX512, HasAVX10_2, In64BitMode
 }
 
 defm TCVTROWD2PS : m_tcvtrowd2ps;
 
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
   let SchedRW = [WriteSystem] in {
     let  usesCustomInserter = 1 in {
       def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
@@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
 
 multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr,
                                 Prefix P1, Prefix P2> {
-  let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in {
+  let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in {
     let OpPrefix = P1 in
       def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
                   (ins TILE:$src1, GR32:$src2),
@@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>;
 defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>;
 
 multiclass m_tilemovrow {
-  let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+  let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
     let SchedRW = [WriteSystem] in {
       def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
                     (ins TILE:$src1, u8imm:$src2),
@@ -669,12 +669,12 @@ multiclass m_tilemovrow {
                   "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   []>, T8,PD, EVEX, VVVV, EVEX_V512;
     }
-  } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+  } // HasAMXAVX512, HasAVX10_2, In64BitMode
 }
 
 defm TILEMOVROW : m_tilemovrow;
 
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
   let SchedRW = [WriteSystem] in {
     let  usesCustomInserter = 1 in {
       def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 2d2bf1f6c725..764ff998bb56 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -15,36 +15,36 @@
 // VNNI FP16
 let ExeDomain = SSEPackedSingle in
 defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info,
-                                    [HasAVX10_2], [HasAVX10_2_512]>,
+                                    [HasAVX10_2], [HasAVX10_2]>,
                     T8, PS, EVEX_CD8<32, CD8VF>;
 
 // VNNI INT8
 defm VPDPBSSD   : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XD;
+                              [HasAVX10_2], [HasAVX10_2]>, XD;
 defm VPDPBSSDS  : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XD;
+                              [HasAVX10_2], [HasAVX10_2]>, XD;
 defm VPDPBSUD   : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+                              [HasAVX10_2], [HasAVX10_2]>, XS;
 defm VPDPBSUDS  : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+                              [HasAVX10_2], [HasAVX10_2]>, XS;
 defm VPDPBUUD   : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+                              [HasAVX10_2], [HasAVX10_2]>, PS;
 defm VPDPBUUDS  : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+                              [HasAVX10_2], [HasAVX10_2]>, PS;
 
 // VNNI INT16
 defm VPDPWSUD   : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+                              [HasAVX10_2], [HasAVX10_2]>, XS;
 defm VPDPWSUDS  : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+                              [HasAVX10_2], [HasAVX10_2]>, XS;
 defm VPDPWUSD   : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PD;
+                              [HasAVX10_2], [HasAVX10_2]>, PD;
 defm VPDPWUSDS  : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PD;
+                              [HasAVX10_2], [HasAVX10_2]>, PD;
 defm VPDPWUUD   : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+                              [HasAVX10_2], [HasAVX10_2]>, PS;
 defm VPDPWUUDS  : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1,
-                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+                              [HasAVX10_2], [HasAVX10_2]>, PS;
 
 // VMPSADBW
 defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW,
@@ -94,9 +94,8 @@ multiclass avx10_minmax_packed_sae<string OpStr, AVX512VLVectorVTInfo VTI, SDNod
 }
 
 multiclass avx10_minmax_packed<string OpStr, AVX512VLVectorVTInfo VTI, SDNode OpNode> {
-  let Predicates = [HasAVX10_2_512] in
-    defm Z    :   avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
   let Predicates = [HasAVX10_2] in {
+    defm Z    :   avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
     defm Z256 :   avx10_minmax_packed_base<OpStr, VTI.info256, OpNode>, EVEX_V256;
     defm Z128 :   avx10_minmax_packed_base<OpStr, VTI.info128, OpNode>, EVEX_V128;
   }
@@ -201,7 +200,7 @@ multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sc
 multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
                             AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
                             SDNode MaskNode> {
-  let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+  let Predicates = [HasAVX10_2], Uses = [MXCSR] in
   defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
                               (outs DestInfo.info512.RC:$dst),
                               (ins SrcInfo.info512.RC:$src, AVX512RC:$rc),
@@ -216,7 +215,7 @@ multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched
 multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
                              AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
                              SDNode Node> {
-  let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+  let Predicates = [HasAVX10_2], Uses = [MXCSR] in
   defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
                              (outs DestInfo.info512.RC:$dst),
                              (ins SrcInfo.info512.RC:$src),
@@ -229,12 +228,11 @@ multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sche
 multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
                                SDNode MaskNode, AVX512VLVectorVTInfo DestInfo,
                                AVX512VLVectorVTInfo SrcInfo> {
-  let Predicates = [HasAVX10_2_512] in
-  defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
-           DestInfo.info512, SrcInfo.info512,
-           MaskNode>,
-      EVEX, EVEX_V512;
   let Predicates = [HasAVX10_2] in {
+    defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
+             DestInfo.info512, SrcInfo.info512,
+             MaskNode>,
+        EVEX, EVEX_V512;
     defm Z256
         : avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM,
            DestInfo.info256, SrcInfo.info256,
@@ -334,13 +332,11 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul,
 multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                             SDNode MaskOpNode, SDNode OpNodeSAE,
                             X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in {
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
                             MaskOpNode, sched.ZMM>,
              avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
                                OpNodeSAE, sched.ZMM>, EVEX_V512;
-  }
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
                               null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
                               f128mem, VK2WM>, EVEX_V128;
@@ -410,13 +406,11 @@ multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
 multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                             SDNode MaskOpNode, SDNode OpNodeRnd,
                             X86SchedWriteWidths sched> {
-   let Predicates = [HasAVX10_2_512] in {
+   let Predicates = [HasAVX10_2] in {
      defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
                             MaskOpNode, sched.ZMM>,
               avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
                                 OpNodeRnd, sched.ZMM>, EVEX_V512;
-   }
-   let Predicates = [HasAVX10_2] in {
      defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
                                MaskOpNode, sched.XMM>, EVEX_V128;
      defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
@@ -432,13 +426,11 @@ multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
 multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                             SDNode MaskOpNode, SDNode OpNodeRnd,
                             X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in {
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
                            MaskOpNode, sched.ZMM>,
              avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
                                OpNodeRnd, sched.ZMM>, EVEX_V512;
-  }
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
                               MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
                               (v2i64 (OpNode (bc_v4f32 (v2f64
@@ -460,14 +452,11 @@ multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
 multiclass avx10_cvttps2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                             SDNode MaskOpNode,
                             SDNode OpNodeSAE, X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in {
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
                            MaskOpNode, sched.ZMM>,
              avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
                                 OpNodeSAE, sched.ZMM>, EVEX_V512;
-  }
-
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
                               MaskOpNode, sched.XMM>, EVEX_V128;
     defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
@@ -719,7 +708,7 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
                            AVX512VLVectorVTInfo _SrcVTInfo,
                            AVX512VLVectorVTInfo _DstVTInfo,
                            SDNode OpNode, SDNode OpNodeRnd> {
-  let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in {
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
                               _SrcVTInfo.info512, _DstVTInfo.info512,
                               _SrcVTInfo.info512>,
@@ -727,8 +716,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
                                 _SrcVTInfo.info512, _DstVTInfo.info512,
                                 OpNodeRnd>,
              EVEX_V512, EVEX_CD8<32, CD8VF>;
-  }
-  let Predicates = [HasAVX10_2] in {
     defm Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
                                  _SrcVTInfo.info256, _DstVTInfo.info256,
                                  _SrcVTInfo.info256>,
@@ -747,19 +734,19 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx",
 
 defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS,
                                      avx512vl_f16_info, avx512vl_i8_info,
-                                     X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>,
+                                     X86vcvt2ph2bf8, [HasAVX10_2], [HasAVX10_2]>,
                                     EVEX_CD8<16, CD8VF>, T8, XD;
 defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS,
                                       avx512vl_f16_info, avx512vl_i8_info,
-                                      X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+                                      X86vcvt2ph2bf8s, [HasAVX10_2], [HasAVX10_2]>,
                                      EVEX_CD8<16, CD8VF>, T_MAP5, XD;
 defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS,
                                      avx512vl_f16_info, avx512vl_i8_info,
-                                     X86vcvt2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>,
+                                     X86vcvt2ph2hf8, [HasAVX10_2], [HasAVX10_2]>,
                                     EVEX_CD8<16, CD8VF>, T_MAP5, XD;
 defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS,
                                       avx512vl_f16_info, avx512vl_i8_info,
-                                      X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+                                      X86vcvt2ph2hf8s, [HasAVX10_2], [HasAVX10_2]>,
                                      EVEX_CD8<16, CD8VF>, T_MAP5, XD;
 
 //TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here.
@@ -836,11 +823,10 @@ multiclass avx10_convert_3op<bits<8> OpCode, string OpcodeStr,
            PatFrag bcast128 = vt_src.info128.BroadcastLdFrag,
            PatFrag loadVT128 = vt_src.info128.LdFrag,
            RegisterClass maskRC128 = vt_src.info128.KRCWM> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info256,
                vt_dst.info512, vt_src.info512, OpNode, OpNode, sched.ZMM>,
                EVEX_V512, EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z256 : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info128,
                   vt_dst.info256, vt_src.info256, OpNode, OpNode, sched.YMM>,
                   EVEX_V256, EVEX_CD8<16, CD8VF>;
@@ -920,25 +906,25 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s",
 defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info,
                                         avx512vl_f16_info, SchedWriteCvtPD2PS,
                                         X86vcvtph2bf8, X86vmcvtph2bf8,
-                                        [HasAVX10_2], [HasAVX10_2_512]>,
+                                        [HasAVX10_2], [HasAVX10_2]>,
                                         T8, XS, EVEX_CD8<16, CD8VF>;
 
 defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info,
                                          avx512vl_f16_info, SchedWriteCvtPD2PS,
                                          X86vcvtph2bf8s, X86vmcvtph2bf8s,
-                                         [HasAVX10_2], [HasAVX10_2_512]>,
+                                         [HasAVX10_2], [HasAVX10_2]>,
                                          T_MAP5, XS, EVEX_CD8<16, CD8VF>;
 
 defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info,
                                         avx512vl_f16_info, SchedWriteCvtPD2PS,
                                         X86vcvtph2hf8, X86vmcvtph2hf8,
-                                        [HasAVX10_2], [HasAVX10_2_512]>,
+                                        [HasAVX10_2], [HasAVX10_2]>,
                                         T_MAP5, XS, EVEX_CD8<16, CD8VF>;
 
 defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", avx512vl_i8_info,
                                          avx512vl_f16_info, SchedWriteCvtPD2PS,
                                          X86vcvtph2hf8s, X86vmcvtph2hf8s,
-                                         [HasAVX10_2], [HasAVX10_2_512]>,
+                                         [HasAVX10_2], [HasAVX10_2]>,
                                          T_MAP5, XS, EVEX_CD8<16, CD8VF>;
 
 multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
@@ -962,10 +948,9 @@ multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
 
 multiclass avx10_convert_2op_nomb<string OpcodeStr, AVX512VLVectorVTInfo _dest,
                                   AVX512VLVectorVTInfo _src, bits<8> opc, SDNode OpNode> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
   defm Z : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info512, _src.info256,
                                          OpNode, f256mem, WriteCvtPH2PSZ>, EVEX_V512;
-  let Predicates = [HasAVX10_2] in {
   defm Z128 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info128, _src.info128,
                                             OpNode, f64mem, WriteCvtPH2PSZ>, EVEX_V128;
   defm Z256 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info256, _src.info128,
@@ -985,13 +970,12 @@ defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info,
 multiclass avx10_fp_binop_int_bf16<bits<8> opc, string OpcodeStr,
                                       X86SchedWriteSizes sched,
                                       bit IsCommutable = 0> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fp_packed<opc, OpcodeStr,
                               !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
                               !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
                               v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
                               T_MAP5, PD, EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fp_packed<opc, OpcodeStr,
                                  !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
                                  !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
@@ -1009,11 +993,10 @@ multiclass avx10_fp_binop_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator
                                 X86SchedWriteSizes sched,
                                 bit IsCommutable = 0,
                                 SDPatternOperator MaskOpNode = OpNode> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, 
                               v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
                               T_MAP5, PD, EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, 
                                  v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128,
                                  T_MAP5, PD, EVEX_CD8<16, CD8VF>;
@@ -1086,9 +1069,8 @@ multiclass avx10_vcmp_common_bf16<X86FoldableSchedWrite sched, X86VectorVTInfo _
 }
 
 multiclass avx10_vcmp_bf16<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
-  let Predicates = [HasAVX10_2_512] in
-    defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
   let Predicates = [HasAVX10_2] in {
+    defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
     defm Z128 : avx10_vcmp_common_bf16<sched.XMM, _.info128>, EVEX_V128;
     defm Z256 : avx10_vcmp_common_bf16<sched.YMM, _.info256>, EVEX_V256;
   }
@@ -1102,11 +1084,10 @@ defm VCMPBF16 : avx10_vcmp_bf16<SchedWriteFCmp, avx512vl_bf16_info>,
 // VSQRTBF16
 multiclass avx10_sqrt_packed_bf16<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteSizes sched> {
-  let Predicates = [HasAVX10_2_512] in
-  defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
-                              sched.PH.ZMM, v32bf16_info>,
-                              EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
   let Predicates = [HasAVX10_2] in {
+    defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
+                                sched.PH.ZMM, v32bf16_info>,
+                                EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
     defm Z128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
                                    sched.PH.XMM, v8bf16x_info>,
                                    EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
@@ -1122,11 +1103,10 @@ defm VSQRTBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrt", SchedWriteFSqrtSizes>;
 // VRSQRTBF16, VRCPBF16, VSRQTBF16, VGETEXPBF16
 multiclass avx10_fp14_bf16<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
-  defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
-                             OpNode, sched.ZMM, v32bf16_info>,
-                             EVEX_V512;
   let Predicates = [HasAVX10_2] in {
+    defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
+                               OpNode, sched.ZMM, v32bf16_info>,
+                               EVEX_V512;
     defm BF16Z128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
                                   OpNode, sched.XMM, v8bf16x_info>,
                                   EVEX_V128;
@@ -1146,10 +1126,9 @@ defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>,
 // VSCALEFBF16
 multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr,
                                 X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32bf16_info>,
                                 EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8bf16x_info>,
                                    EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS;
     defm Z256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16bf16x_info>,
@@ -1164,10 +1143,9 @@ defm VSCALEFBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>;
 multiclass avx10_common_unary_fp_packed_imm_bf16<string OpcodeStr,
             AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode,
             SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z    : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
                                            sched.ZMM, _.info512>, EVEX_V512;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
                                            sched.XMM, _.info128>, EVEX_V128;
     defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
@@ -1190,11 +1168,10 @@ defm VGETMANTBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_b
 // VFPCLASSBF16
 multiclass avx10_fp_fpclass_bf16<string OpcodeStr, bits<8> opcVec,
                                   X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_vector_fpclass<opcVec, OpcodeStr, sched.ZMM,
                                    avx512vl_bf16_info.info512, "z",
                                    []<Register>>, EVEX_V512;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_vector_fpclass<opcVec, OpcodeStr, sched.XMM,
                                       avx512vl_bf16_info.info128, "x",
                                       []<Register>>, EVEX_V128;
@@ -1211,11 +1188,10 @@ defm VFPCLASSBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>,
 multiclass avx10_fma3p_213_bf16<bits<8> opc, string OpcodeStr,
                                 SDPatternOperator OpNode, SDNode MaskOpNode,
                                 X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
                                EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
                                EVEX_CD8<16, CD8VF>;
@@ -1239,11 +1215,10 @@ defm VFNMSUB213BF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213bf16", X86any_Fnmsub
 multiclass avx10_fma3p_231_bf16<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode, SDNode MaskOpNode,
                                  X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
                                EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
                                EVEX_CD8<16, CD8VF>;
@@ -1267,11 +1242,10 @@ defm VFNMSUB231BF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231bf16", X86any_Fnmsub
 multiclass avx10_fma3p_132_bf16<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode, SDNode MaskOpNode,
                                  X86SchedWriteWidths sched> {
-  let Predicates = [HasAVX10_2_512] in
+  let Predicates = [HasAVX10_2] in {
     defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                  sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
                                  EVEX_CD8<16, CD8VF>;
-  let Predicates = [HasAVX10_2] in {
     defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
                                     sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
                                     EVEX_CD8<16, CD8VF>;
@@ -1440,9 +1414,8 @@ multiclass vmovrs_p<bits<8> opc, string OpStr, X86VectorVTInfo _> {
 }
 
 multiclass vmovrs_p_vl<bits<8> opc, string OpStr, AVX512VLVectorVTInfo _Vec> {
-  let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in
-    defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
   let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in {
+    defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
     defm Z128 : vmovrs_p<opc, OpStr, _Vec.info128>, EVEX_V128;
     defm Z256 : vmovrs_p<opc, OpStr, _Vec.info256>, EVEX_V256;
   }
@@ -1464,7 +1437,7 @@ multiclass avx10_sm4_base<string OpStr> {
     defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128;
     defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256;
   }
-  let Predicates = [HasSM4, HasAVX10_2_512] in
+  let Predicates = [HasSM4, HasAVX10_2] in
     defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512;
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0ab94cca4142..3401f6f04800 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -834,7 +834,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info,
 
 // A 128-bit extract from bits [255:128] of a 512-bit vector should use a
 // smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX, HasEVEX512] in {
+let Predicates = [NoVLX] in {
 def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
           (v2i64 (VEXTRACTI128rri
                   (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3088,7 +3088,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
            addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
 }
 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
 
@@ -3119,7 +3119,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
   defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
 
@@ -3513,7 +3513,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
 
 // Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
 // available. Use a 512-bit operation and extract.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
   defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
   defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3525,7 +3525,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
   defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
   defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
 
@@ -5021,8 +5021,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
 defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
                                     SchedWriteVecALU, HasAVX512, 1>, T8;
 
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
-let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasDQI, NoVLX] in {
   def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
             (EXTRACT_SUBREG
                 (VPMULLQZrr
@@ -5078,7 +5078,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
              sub_xmm)>;
 }
 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
   defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
   defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6055,7 +6055,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
                                 SchedWriteVecShift>;
 
 // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPSRAQZrr
@@ -6184,14 +6184,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS
 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
 
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
 
 
 // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPROLVQZrr
@@ -6242,7 +6242,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
 }
 
 // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPRORVQZrr
@@ -9933,7 +9933,7 @@ defm VPMOVUSWB  : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
                                   truncstore_us_vi8, masked_truncstore_us_vi8,
                                   X86vtruncus, X86vmtruncus>;
 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
 def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
          (v8i16 (EXTRACT_SUBREG
                  (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9944,7 +9944,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
                                            VR256X:$src, sub_ymm)))), sub_xmm))>;
 }
 
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
 def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
          (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
                                             VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10487,7 +10487,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
     defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
                                                EVEX_V128;
   }
-  let Predicates = [prd, NoVLX, HasEVEX512] in {
+  let Predicates = [prd, NoVLX] in {
     defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
     defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
   }
@@ -11283,7 +11283,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
                                     SchedWriteVecALU>;
 
 // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   def : Pat<(v4i64 (abs VR256X:$src)),
             (EXTRACT_SUBREG
                 (VPABSQZrr
@@ -11299,7 +11299,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
 // Use 512bit version to implement 128/256 bit.
 multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
                                  AVX512VLVectorVTInfo _, Predicate prd> {
-  let Predicates = [prd, NoVLX, HasEVEX512] in {
+  let Predicates = [prd, NoVLX] in {
     def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
               (EXTRACT_SUBREG
                 (!cast<Instruction>(InstrStr # "Zrr")
@@ -11918,7 +11918,7 @@ let Predicates = [HasAVX512] in {
             (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
 }
 
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
   def : Pat<(v16i8 (vnot VR128X:$src)),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 927b2c8b22f0..5a0df058b27f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1326,7 +1326,11 @@ def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
 // Match an X86tcret that uses less than 7 volatile registers.
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
           (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
-          Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
+          Requires<[Not64BitMode, IsNotHiPECCFunc, NotUseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret GR32:$dst, timm:$off),
+          (TCRETURN_HIPE32ri GR32:$dst, timm:$off)>,
+          Requires<[Not64BitMode, IsHiPECCFunc, NotUseIndirectThunkCalls]>;
 
 // FIXME: This is disabled for 32-bit PIC mode because the global base
 // register which is part of the address mode may be assigned a
@@ -1346,7 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
 
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
           (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
-          Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+          Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+
+def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
+          (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
+          Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
 
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
           (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 22253bf0413a..139aedd473eb 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -282,6 +282,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
                            []>, Sched<[WriteJump]>;
   def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
                            []>, Sched<[WriteJump]>;
+  // Tail-call pseudo for HiPE functions: the target may be any GR32.
+  def TCRETURN_HIPE32ri : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset),
+                                  []>, Sched<[WriteJump]>;
+
   let mayLoad = 1 in
   def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
                            []>, Sched<[WriteJumpLd]>;
@@ -357,6 +361,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
   def TCRETURNri64   : PseudoI<(outs),
                                (ins ptr_rc_tailcall:$dst, i32imm:$offset),
                                []>, Sched<[WriteJump]>;
+  def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset),
+                                []>, Sched<[WriteJump]>;
+
   def TCRETURNri64_ImpCall   : PseudoI<(outs),
                                (ins GR64_A:$dst, i32imm:$offset),
                                []>, Sched<[WriteJump]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec3..a68edf4d2b7e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -83,8 +83,9 @@ static cl::opt<unsigned> UndefRegClearance(
 // Pin the vtable to this file.
 void X86InstrInfo::anchor() {}
 
-X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
-    : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
+    : X86GenInstrInfo(STI,
+                      (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
                                                : X86::ADJCALLSTACKDOWN32),
                       (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
                                                : X86::ADJCALLSTACKUP32),
@@ -4399,13 +4400,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
   if (STI.hasFP16())
     return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
   if (Load)
-    return STI.hasAVX512() ? X86::VMOVSSZrm
-           : STI.hasAVX()  ? X86::VMOVSSrm
-                           : X86::MOVSSrm;
-  else
-    return STI.hasAVX512() ? X86::VMOVSSZmr
-           : STI.hasAVX()  ? X86::VMOVSSmr
-                           : X86::MOVSSmr;
+    return X86::MOVSHPrm;
+  return X86::MOVSHPmr;
 }
 
 static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -4903,6 +4899,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
     CmpMask = ~0;
     CmpValue = 0;
     return true;
+  case X86::TEST64ri32:
+  case X86::TEST32ri:
+  case X86::TEST16ri:
+  case X86::TEST8ri:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    // Force identical compare.
+    CmpMask = 0;
+    CmpValue = 0;
+    return true;
   }
   return false;
 }
@@ -4942,6 +4948,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
   case X86::CMP32ri:
   case X86::CMP16ri:
   case X86::CMP8ri:
+  case X86::TEST64ri32:
+  case X86::TEST32ri:
+  case X86::TEST16ri:
+  case X86::TEST8ri:
   CASE_ND(SUB64ri32)
   CASE_ND(SUB32ri)
   CASE_ND(SUB16ri)
@@ -6131,6 +6141,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
   return true;
 }
 
+static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
+                         const TargetInstrInfo &TII, bool HasAVX) {
+  unsigned NewOpc; // Real MOVSS-family opcode that replaces the pseudo.
+  if (MI.getOpcode() == X86::MOVSHPrm) { // Load form.
+    NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+    Register Reg = MI.getOperand(0).getReg(); // $dst, the only output.
+    if (Reg > X86::XMM15) // Enum compare; presumably XMM16+ (EVEX-only).
+      NewOpc = X86::VMOVSSZrm;
+  } else { // Store form; the only other opcode the caller dispatches here.
+    NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+    Register Reg = MI.getOperand(5).getReg(); // $src; index 5 - TODO confirm.
+    if (Reg > X86::XMM15) // Enum compare; presumably XMM16+ (EVEX-only).
+      NewOpc = X86::VMOVSSZmr;
+  }
+  // Only the descriptor is swapped; the operands are left as they are.
+  MIB->setDesc(TII.get(NewOpc));
+  return true;
+}
+
 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   bool HasAVX = Subtarget.hasAVX();
   MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6232,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     }
     return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
   }
+  case X86::MOVSHPmr:
+  case X86::MOVSHPrm:
+    return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
   case X86::V_SETALLONES:
     return Expand2AddrUndef(MIB,
                             get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 9dc5f4b0e086..f087b7f20ff6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -222,7 +222,7 @@ inline static bool isMemInstrWithGOTPCREL(const MachineInstr &MI) {
 }
 
 class X86InstrInfo final : public X86GenInstrInfo {
-  X86Subtarget &Subtarget;
+  const X86Subtarget &Subtarget;
   const X86RegisterInfo RI;
 
   LLVM_DECLARE_VIRTUAL_ANCHOR_FUNCTION();
@@ -238,7 +238,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
                          bool MakeChange) const;
 
 public:
-  explicit X86InstrInfo(X86Subtarget &STI);
+  explicit X86InstrInfo(const X86Subtarget &STI);
 
   /// Given a machine instruction descriptor, returns the register
   /// class constraint for OpNum, or NULL. Returned register class
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index df1541e9085b..8339c2081842 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -69,11 +69,8 @@ def NoAVX        : Predicate<"!Subtarget->hasAVX()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
 def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasEVEX512   : Predicate<"Subtarget->hasEVEX512()">;
 def HasAVX10_1   : Predicate<"Subtarget->hasAVX10_1()">;
-def HasAVX10_1_512 : Predicate<"Subtarget->hasAVX10_1_512()">;
 def HasAVX10_2   : Predicate<"Subtarget->hasAVX10_2()">;
-def HasAVX10_2_512 : Predicate<"Subtarget->hasAVX10_2_512()">;
 def NoAVX10_2    : Predicate<"!Subtarget->hasAVX10_2()">;
 def HasAVX512    : Predicate<"Subtarget->hasAVX512()">;
 def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
@@ -233,6 +230,13 @@ let RecomputePerFunction = 1 in {
                                         "!Subtarget->hasSSE41()">;
   def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
   def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
+
+  def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+  def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+  def IsHiPECCFunc : Predicate<"MF->getFunction().getCallingConv() == CallingConv::HiPE">;
+
+  def IsNotHiPECCFunc : Predicate<
+    "MF->getFunction().getCallingConv() != CallingConv::HiPE">;
 }
 
 def CallImmAddr  : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da20..b7926497c92b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
   }
 }
 
+// Pseudo instructions for FP16 spill/reload; expanded to (V)MOVSS post-RA.
+let isPseudo = 1, Predicates = [HasSSE2] in {
+  let mayStore = 1 in
+  def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
+                   [], SSEPackedSingle>,
+                   Sched<[WriteFStore]>;
+  let  mayLoad = 1 in
+  def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
+                   [], SSEPackedSingle>,
+                   Sched<[WriteFLoad]>;
+}
+
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                         SSEPackedSingle, UseSSE1>, TB, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 632db7e4326e..4188487d7591 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
 bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
                                               Value *LaneMask,
                                               ShuffleVectorInst *SVI,
-                                              unsigned Factor) const {
+                                              unsigned Factor,
+                                              const APInt &GapMask) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
@@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-  assert(!LaneMask && "Unexpected mask on store");
+  assert(!LaneMask && GapMask.popcount() == Factor &&
+         "Unexpected mask on store");
 
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 595ad3290eed..9ec04e740a08 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
     // we can still use 64-bit register as long as we know the high bits
     // are zeros.
     // Reflect that in the returned register class.
-    if (Is64Bit) {
-      // When the target also allows 64-bit frame pointer and we do have a
-      // frame, this is fine to use it for the address accesses as well.
-      const X86FrameLowering *TFI = getFrameLowering(MF);
-      return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
-                 ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
-                 : &X86::LOW32_ADDR_ACCESSRegClass;
-    }
-    return &X86::GR32RegClass;
+    return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass;
   case 1: // Normal GPRs except the stack pointer (for encoding reasons).
     if (Subtarget.isTarget64BitLP64())
       return &X86::GR64_NOSPRegClass;
@@ -228,25 +220,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
     // NOSP does not contain RIP, so no special case here.
     return &X86::GR32_NOREX_NOSPRegClass;
   case 4: // Available for tailcall (not callee-saved GPRs).
-    return getGPRsForTailCall(MF);
+    return Is64Bit ? &X86::GR64_TCRegClass : &X86::GR32_TCRegClass;
   }
 }
 
 const TargetRegisterClass *
-X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-  if (IsWin64 || IsUEFI64 || (F.getCallingConv() == CallingConv::Win64))
-    return &X86::GR64_TCW64RegClass;
-  else if (Is64Bit)
-    return &X86::GR64_TCRegClass;
-
-  bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
-  if (hasHipeCC)
-    return &X86::GR32RegClass;
-  return &X86::GR32_TCRegClass;
-}
-
-const TargetRegisterClass *
 X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   if (RC == &X86::CCRRegClass) {
     if (Is64Bit)
@@ -1007,11 +985,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 unsigned X86RegisterInfo::findDeadCallerSavedReg(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
   const MachineFunction *MF = MBB.getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
   if (MF->callsEHReturn())
     return 0;
 
-  const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
-
   if (MBBI == MBB.end())
     return 0;
 
@@ -1026,6 +1003,8 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
   case X86::RETI64:
   case X86::TCRETURNdi:
   case X86::TCRETURNri:
+  case X86::TCRETURN_WIN64ri:
+  case X86::TCRETURN_HIPE32ri:
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
@@ -1033,20 +1012,16 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
   case X86::TCRETURNmi64:
   case X86::EH_RETURN:
   case X86::EH_RETURN64: {
-    SmallSet<uint16_t, 8> Uses;
-    for (MachineOperand &MO : MBBI->operands()) {
-      if (!MO.isReg() || MO.isDef())
-        continue;
-      Register Reg = MO.getReg();
-      if (!Reg)
-        continue;
-      for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
-        Uses.insert(*AI);
+    LiveRegUnits LRU(*this);
+    LRU.addLiveOuts(MBB);
+    LRU.stepBackward(*MBBI);
+
+    const TargetRegisterClass &RC =
+        Is64Bit ? X86::GR64_NOSPRegClass : X86::GR32_NOSPRegClass;
+    for (MCRegister Reg : RC) {
+      if (LRU.available(Reg) && !MRI.isReserved(Reg))
+        return Reg;
     }
-
-    for (auto CS : AvailableRegs)
-      if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
-        return CS;
   }
   }
 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 2f4c55cfad6d..d022e5ab8794 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -87,11 +87,6 @@ public:
   const TargetRegisterClass *
   getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
 
-  /// getGPRsForTailCall - Returns a register class with registers that can be
-  /// used in forming tail calls.
-  const TargetRegisterClass *
-  getGPRsForTailCall(const MachineFunction &MF) const;
-
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index e9ca25d808a5..99b7910131dc 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64,
 // which we do not have right now.
 def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
 
-// When RBP is used as a base pointer in a 32-bit addresses environment,
-// this is also safe to use the full register to access addresses.
-// Since RBP will never be spilled, stick to a 32 alignment to save
-// on memory consumption.
+// FIXME: This class is unused, but deleting it results in codegen changes.
 def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
                                           (add LOW32_ADDR_ACCESS, RBP)>;
 
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index 9e271c1ee370..044b77f7aacf 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -992,14 +992,14 @@ def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
 def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
 
 def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
 }
 def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
 
 def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
 }
@@ -1221,7 +1221,7 @@ def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
 def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
 
 def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG1rr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
 }
@@ -1235,7 +1235,7 @@ def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
 def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
 
 def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
 }
@@ -1249,7 +1249,7 @@ def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
 def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
 
 def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG1rr.Latency);
   let ReleaseAtCycles = [1, 1, 3];
   let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
 }
@@ -1263,7 +1263,7 @@ def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
 def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
 
 def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG2rr.Latency);
   let ReleaseAtCycles = [1, 1, 8];
   let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
 }
@@ -1338,14 +1338,14 @@ def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
 def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
 
 def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
 }
 def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
 
 def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
-  let Latency = !add(Znver3Model.LoadLatency, 7);
+  let Latency = !add(Znver3Model.VecLoadLatency, 7);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = 3;
 }
@@ -1359,14 +1359,14 @@ def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
 def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
 
 def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
-  let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
+  let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERMYri.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
 }
 def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
 
 def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
-  let Latency = !add(Znver3Model.LoadLatency, 5);
+  let Latency = !add(Znver3Model.VecLoadLatency, 5);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = 2;
 }
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 74d916d41f83..a93c7e3a82f1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1005,14 +1005,14 @@ def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
 def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
 
 def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
 }
 def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
 
 def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
 }
@@ -1262,7 +1262,7 @@ def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
 def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
 
 def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG1rr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
 }
@@ -1276,7 +1276,7 @@ def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
 def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
 
 def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
 }
@@ -1290,7 +1290,7 @@ def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
 def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
 
 def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG1rr.Latency);
   let ReleaseAtCycles = [1, 1, 3];
   let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
 }
@@ -1304,7 +1304,7 @@ def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
 def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
 
 def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG2rr.Latency);
   let ReleaseAtCycles = [1, 1, 8];
   let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
 }
@@ -1379,7 +1379,7 @@ def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
 
 def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
   let ReleaseAtCycles = [1, 1, 1];
   let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
 }
@@ -1393,7 +1393,7 @@ def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
 
 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
 }
@@ -1407,7 +1407,7 @@ def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
 
 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
 }
@@ -1421,7 +1421,7 @@ def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
 
 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
-  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
+  let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
   let ReleaseAtCycles = [1, 1, 2];
   let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
 }
@@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
   let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
-	"VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", 
+        "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", 
         "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri",  "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
-	"VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+        "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
 	)>;
 
 // SCALE & REDUCE instructions
@@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
   let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
-	"VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+        "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
         "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
 	)>;
 
@@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
         "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
         "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
         "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
-	"VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
+        "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
 	)>;
 
 def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
@@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex
         "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
 	)>;
 
-// ALIGN Instructions
-def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+// ALIGNR Instructions
+def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGNR], (instregex
+        "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)"
+	)>;
+def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> {
   let Latency = 2;
   let ReleaseAtCycles = [2];
   let NumMicroOps = 1;
 }
-def : InstRW<[Zn4WriteALIGN], (instregex
-        "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+def : InstRW<[Zn4WriteALIGNRZ], (instregex
+        "(V?)PALIGNRZ(rri|rrik|rrikz)"
 	)>;
 
-//PACK Instructions
+// PACK Instructions
 def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
   let Latency = 2;
-  let ReleaseAtCycles = [2];
+  let ReleaseAtCycles = [1];
   let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WritePACK], (instregex
-        "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+        "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)"
+	)>;
+def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ReleaseAtCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACKZ], (instregex
+        "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)"
 	)>;
 
 // MAX and MIN Instructions
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8ad8d423d10c..fd5f34b60efb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -261,26 +261,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
   if (!FS.empty())
     FullFS = (Twine(FullFS) + "," + FS).str();
 
-  // Attach EVEX512 feature when we have AVX512 features with a default CPU.
-  // "pentium4" is default CPU for 32-bit targets.
-  // "x86-64" is default CPU for 64-bit targets.
-  if (CPU == "generic" || CPU == "pentium4" || CPU == "x86-64") {
-    size_t posNoEVEX512 = FS.rfind("-evex512");
-    // Make sure we won't be cheated by "-avx512fp16".
-    size_t posNoAVX512F =
-        FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
-    size_t posEVEX512 = FS.rfind("+evex512");
-    // Any AVX512XXX will enable AVX512F.
-    size_t posAVX512F = FS.rfind("+avx512");
-
-    if (posAVX512F != StringRef::npos &&
-        (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
-      if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
-        FullFS += ",+evex512";
-  }
-
   // Disable 64-bit only features in non-64-bit mode.
-  SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+  StringRef FeaturesIn64BitOnly[] = {
       "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
   if (FullFS.find("-64bit-mode") != std::string::npos)
     for (StringRef F : FeaturesIn64BitOnly)
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index be49214e041e..fa3f3b59741d 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -226,8 +226,7 @@ public:
   // TODO: Currently we're always allowing widening on CPUs without VLX,
   // because for many cases we don't have a better option.
   bool canExtendTo512DQ() const {
-    return hasAVX512() && hasEVEX512() &&
-           (!hasVLX() || getPreferVectorWidth() >= 512);
+    return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
   }
   bool canExtendTo512BW() const  {
     return hasBWI() && canExtendTo512DQ();
@@ -247,8 +246,7 @@ public:
   // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
   // disable them in the legalizer.
   bool useAVX512Regs() const {
-    return hasAVX512() && hasEVEX512() &&
-           (canExtendTo512DQ() || RequiredVectorWidth > 256);
+    return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
   }
 
   bool useLight256BitInstructions() const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 62f95277d016..3d8d0a236a3c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -213,7 +213,7 @@ X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
   case TargetTransformInfo::RGK_FixedWidthVector:
-    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
+    if (ST->hasAVX512() && PreferVectorWidth >= 512)
       return TypeSize::getFixed(512);
     if (ST->hasAVX() && PreferVectorWidth >= 256)
       return TypeSize::getFixed(256);
@@ -1206,6 +1206,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::MUL,     MVT::v4i32,   {  2,  5,  1,  3 } }, // pmulld
     { ISD::MUL,     MVT::v4i64,   { 12, 15, 19, 20 } },
 
+    { X86ISD::PMULUDQ, MVT::v4i64, { 3,  5, 5, 6 } }, // pmuludq + split
+
     { ISD::AND,     MVT::v32i8,   {  1,  1, 1, 2 } }, // vandps
     { ISD::AND,     MVT::v16i16,  {  1,  1, 1, 2 } }, // vandps
     { ISD::AND,     MVT::v8i32,   {  1,  1, 1, 2 } }, // vandps
@@ -6591,7 +6593,7 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     // Only enable vector loads for equality comparison. Right now the vector
     // version is not as fast for three way compare (see #33329).
     const unsigned PreferredWidth = ST->getPreferVectorWidth();
-    if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
+    if (PreferredWidth >= 512 && ST->hasAVX512())
       Options.LoadSizes.push_back(64);
     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index ea8b88f41bb8..9bf0abb018c9 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
   // Prolog information.
   SmallVector<int64_t> PushedRegs;
   bool HasStackAlloc = false;
+  bool HasSetFrame = false;
   unsigned ApproximatePrologCodeCount = 0;
 
   // Requested changes.
@@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
         break;
 
       case X86::SEH_StackAlloc:
-      case X86::SEH_SetFrame:
         if (State != FunctionState::InProlog)
-          llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+          llvm_unreachable("SEH_StackAlloc outside of prolog");
         // Assume a large alloc...
-        ApproximatePrologCodeCount +=
-            (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
+        ApproximatePrologCodeCount += 3;
         HasStackAlloc = true;
         break;
 
+      case X86::SEH_SetFrame:
+        if (State != FunctionState::InProlog)
+          llvm_unreachable("SEH_SetFrame outside of prolog");
+        ApproximatePrologCodeCount++;
+        HasSetFrame = true;
+        break;
+
       case X86::SEH_SaveReg:
       case X86::SEH_SaveXMM:
         if (State != FunctionState::InProlog)
@@ -190,8 +196,30 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
         State = FunctionState::FinishedEpilog;
         break;
 
-      case X86::LEA64r:
       case X86::MOV64rr:
+        if (State == FunctionState::InEpilog) {
+          // A mov in the epilog sets the frame back: the prolog must have set
+          // it, and no pop or stack dealloc may have been seen before it.
+          if (!HasSetFrame)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is setting frame back, but prolog did not set it");
+          if (PoppedRegCount > 0)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is setting the frame back after popping "
+                "registers");
+          if (HasStackDealloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "Cannot set the frame back after the stack "
+                "allocation has been deallocated");
+        } else if (State == FunctionState::FinishedEpilog)
+          return rejectCurrentFunctionInternalError(
+              MF, Mode, "Unexpected mov instruction after the epilog");
+        break;
+
+      case X86::LEA64r:
       case X86::ADD64ri32:
         if (State == FunctionState::InEpilog) {
           // If the prolog contains a stack allocation, then the first
@@ -211,8 +239,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           HasStackDealloc = true;
         } else if (State == FunctionState::FinishedEpilog)
           return rejectCurrentFunctionInternalError(
-              MF, Mode,
-              "Unexpected lea, mov or add instruction after the epilog");
+              MF, Mode, "Unexpected lea or add instruction after the epilog");
         break;
 
       case X86::POP64r:
@@ -278,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (UnwindV2StartLocations.empty()) {
-    assert(State == FunctionState::InProlog &&
-           "If there are no epilogs, then there should be no prolog");
+  if (UnwindV2StartLocations.empty())
     return false;
-  }
 
   MachineBasicBlock &FirstMBB = MF.front();
   // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 6921f44b700c..096ad08d8a3c 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -71,113 +71,11 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
 
 static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
   const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
-  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+  return RegInfo->getRegClass(RC).getRegister(RegNo);
 }
 
 static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               uint64_t Address,
-                                              const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
-                                      uint64_t Address,
-                                      const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RImmInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRUSInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn,
-                                             uint64_t Address,
-                                             const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
-                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder);
-
-static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder);
-
-static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL5RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder);
-
-static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder);
-
-static DecodeStatus
-DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const MCDisassembler *Decoder);
-
-#include "XCoreGenDisassemblerTables.inc"
-
-static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address,
                                               const MCDisassembler *Decoder) {
   if (RegNo > 11)
     return MCDisassembler::Fail;
@@ -249,6 +147,116 @@ Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address,
+                                        const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    Inst.addOperand(MCOperand::createImm(Op1));
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::createImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
+                                               uint64_t Address,
+                                               const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::createImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
+                                               uint64_t Address,
+                                               const MCDisassembler *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+
 static DecodeStatus Decode2OpInstructionFail(MCInst &Inst, unsigned Insn,
                                              uint64_t Address,
                                              const MCDisassembler *Decoder) {
@@ -511,115 +519,6 @@ static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus Decode3RInstruction(MCInst &Inst, unsigned Insn,
-                                        uint64_t Address,
-                                        const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
-static DecodeStatus Decode3RImmInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    Inst.addOperand(MCOperand::createImm(Op1));
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
-static DecodeStatus Decode2RUSInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    Inst.addOperand(MCOperand::createImm(Op3));
-  }
-  return S;
-}
-
-static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeBitpOperand(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
-static DecodeStatus DecodeL3RInstruction(MCInst &Inst, unsigned Insn,
-                                         uint64_t Address,
-                                         const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S =
-    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
-static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S =
-  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
-static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn,
-                                           uint64_t Address,
-                                           const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S =
-  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    Inst.addOperand(MCOperand::createImm(Op3));
-  }
-  return S;
-}
-
-static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const MCDisassembler *Decoder) {
-  unsigned Op1, Op2, Op3;
-  DecodeStatus S =
-  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
-  if (S == MCDisassembler::Success) {
-    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
-    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
-    DecodeBitpOperand(Inst, Op3, Address, Decoder);
-  }
-  return S;
-}
-
 static DecodeStatus DecodeL6RInstruction(MCInst &Inst, unsigned Insn,
                                          uint64_t Address,
                                          const MCDisassembler *Decoder) {
@@ -713,6 +612,8 @@ DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
   return S;
 }
 
+#include "XCoreGenDisassemblerTables.inc"
+
 MCDisassembler::DecodeStatus
 XCoreDisassembler::getInstruction(MCInst &instr, uint64_t &Size,
                                   ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 0a86588b6bdb..1a9133aad4dd 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "XCoreInstrInfo.h"
 #include "XCore.h"
+#include "XCoreSubtarget.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -41,10 +42,9 @@ namespace XCore {
 // Pin the vtable to this file.
 void XCoreInstrInfo::anchor() {}
 
-XCoreInstrInfo::XCoreInstrInfo()
-  : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
-    RI() {
-}
+XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST)
+    : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+      RI() {}
 
 static bool isZeroImm(const MachineOperand &op) {
   return op.isImm() && op.getImm() == 0;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 5026671616fa..354339265378 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -20,12 +20,13 @@
 #include "XCoreGenInstrInfo.inc"
 
 namespace llvm {
+class XCoreSubtarget;
 
 class XCoreInstrInfo : public XCoreGenInstrInfo {
   const XCoreRegisterInfo RI;
   virtual void anchor();
 public:
-  XCoreInstrInfo();
+  explicit XCoreInstrInfo(const XCoreSubtarget &ST);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index d4b777ef447f..2f6517ec9e7a 100644
--- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { }
 
 XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS, const TargetMachine &TM)
-    : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(*this),
-      TLInfo(TM, *this) {}
+    : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(*this),
+      FrameLowering(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index f1367037bdf4..c211777e6989 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -232,12 +232,6 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
-
-    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
   } else {
     setOperationAction(ISD::BITCAST, MVT::i32, Expand);
     setOperationAction(ISD::BITCAST, MVT::f32, Expand);
@@ -887,6 +881,16 @@ static std::pair<unsigned, unsigned> getFPBranchKind(ISD::CondCode Cond) {
     return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
   case ISD::SETGT:
     return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+  case ISD::SETOGT:
+    return std::make_pair(Xtensa::BF, Xtensa::ULE_S);
+  case ISD::SETOGE:
+    return std::make_pair(Xtensa::BF, Xtensa::ULT_S);
+  case ISD::SETONE:
+    return std::make_pair(Xtensa::BF, Xtensa::UEQ_S);
+  case ISD::SETUGT:
+    return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+  case ISD::SETUGE:
+    return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
   default:
     llvm_unreachable("Invalid condition!");
   }
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index 55c0729a0c9e..b0f924f2cd58 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -48,7 +48,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
 }
 
 XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
-    : XtensaGenInstrInfo(Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
+    : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
       RI(STI), STI(STI) {}
 
 Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 24827537eb19..63848160636a 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1396,7 +1396,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(X86::FEATURE_BMI2);
   if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) {
     setFeature(X86::FEATURE_AVX512F);
-    setFeature(X86::FEATURE_EVEX512);
   }
   if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
     setFeature(X86::FEATURE_AVX512DQ);
@@ -2063,8 +2062,6 @@ StringMap<bool> sys::getHostCPUFeatures() {
   Features["rtm"]        = HasLeaf7 && ((EBX >> 11) & 1);
   // AVX512 is only supported if the OS supports the context save for it.
   Features["avx512f"]    = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
-  if (Features["avx512f"])
-    Features["evex512"]  = true;
   Features["avx512dq"]   = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
   Features["rdseed"]     = HasLeaf7 && ((EBX >> 18) & 1);
   Features["adx"]        = HasLeaf7 && ((EBX >> 19) & 1);
@@ -2176,11 +2173,8 @@ StringMap<bool> sys::getHostCPUFeatures() {
       MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
 
   int AVX10Ver = HasLeaf24 && (EBX & 0xff);
-  int Has512Len = HasLeaf24 && ((EBX >> 18) & 1);
-  Features["avx10.1-256"] = HasAVX10 && AVX10Ver >= 1;
-  Features["avx10.1-512"] = HasAVX10 && AVX10Ver >= 1 && Has512Len;
-  Features["avx10.2-256"] = HasAVX10 && AVX10Ver >= 2;
-  Features["avx10.2-512"] = HasAVX10 && AVX10Ver >= 2 && Has512Len;
+  Features["avx10.1"] = HasAVX10 && AVX10Ver >= 1;
+  Features["avx10.2"] = HasAVX10 && AVX10Ver >= 2;
 
   return Features;
 }
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index 9957ec0c28d8..b53a1b95431a 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -153,12 +153,13 @@ namespace RISCVVType {
 //
 // Bits | Name       | Description
 // -----+------------+------------------------------------------------
+// 8    | altfmt     | Alternative format for bf16
 // 7    | vma        | Vector mask agnostic
 // 6    | vta        | Vector tail agnostic
 // 5:3  | vsew[2:0]  | Standard element width (SEW) setting
 // 2:0  | vlmul[2:0] | Vector register group multiplier (LMUL) setting
 unsigned encodeVTYPE(VLMUL VLMul, unsigned SEW, bool TailAgnostic,
-                     bool MaskAgnostic) {
+                     bool MaskAgnostic, bool AltFmt) {
   assert(isValidSEW(SEW) && "Invalid SEW");
   unsigned VLMulBits = static_cast<unsigned>(VLMul);
   unsigned VSEWBits = encodeSEW(SEW);
@@ -167,6 +168,8 @@ unsigned encodeVTYPE(VLMUL VLMul, unsigned SEW, bool TailAgnostic,
     VTypeI |= 0x40;
   if (MaskAgnostic)
     VTypeI |= 0x80;
+  if (AltFmt)
+    VTypeI |= 0x100;
 
   return VTypeI;
 }
@@ -200,6 +203,10 @@ void printVType(unsigned VType, raw_ostream &OS) {
   unsigned Sew = getSEW(VType);
   OS << "e" << Sew;
 
+  bool AltFmt = RISCVVType::isAltFmt(VType);
+  if (AltFmt)
+    OS << "alt";
+
   unsigned LMul;
   bool Fractional;
   std::tie(LMul, Fractional) = decodeVLMUL(getVLMUL(VType));
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 50b97d325754..2194ef4df14d 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -364,8 +364,326 @@ StringRef AMDGPU::getCanonicalArchName(const Triple &T, StringRef Arch) {
   return T.isAMDGCN() ? getArchNameAMDGCN(ProcKind) : getArchNameR600(ProcKind);
 }
 
-void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
-                                  StringMap<bool> &Features) {
+static std::pair<FeatureError, StringRef>
+insertWaveSizeFeature(StringRef GPU, const Triple &T,
+                      const StringMap<bool> &DefaultFeatures,
+                      StringMap<bool> &Features) {
+  const bool IsNullGPU = GPU.empty();
+  const bool TargetHasWave32 = DefaultFeatures.count("wavefrontsize32");
+  const bool TargetHasWave64 = DefaultFeatures.count("wavefrontsize64");
+  const bool HaveWave32 = Features.count("wavefrontsize32");
+  const bool HaveWave64 = Features.count("wavefrontsize64");
+  if (HaveWave32 && HaveWave64)
+    return {AMDGPU::INVALID_FEATURE_COMBINATION,
+            "'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive"};
+
+  if (HaveWave32 && !IsNullGPU && TargetHasWave64)
+    return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize32"};
+
+  if (HaveWave64 && !IsNullGPU && TargetHasWave32)
+    return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize64"};
+
+  // Don't assume any wavesize with an unknown subtarget.
+  // Default to wave32 if target supports both.
+  if (!IsNullGPU && !HaveWave32 && !HaveWave64 && !TargetHasWave32 &&
+      !TargetHasWave64)
+    Features.insert(std::make_pair("wavefrontsize32", true));
+
+  for (const auto &Entry : DefaultFeatures) {
+    if (!Features.count(Entry.getKey()))
+      Features[Entry.getKey()] = Entry.getValue();
+  }
+
+  return {NO_ERROR, StringRef()};
+}
+
+/// Fills Features map with default values for given target GPU.
+/// \p Features contains overriding target features and this function returns
+/// default target features with entries overridden by \p Features.
+static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
+                                 StringMap<bool> &Features) {
+  AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
+  switch (Kind) {
+  case GK_GFX1250:
+    Features["ci-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["gfx12-insts"] = true;
+    Features["gfx1250-insts"] = true;
+    Features["bitop3-insts"] = true;
+    Features["prng-inst"] = true;
+    Features["tanh-insts"] = true;
+    Features["tensor-cvt-lut-insts"] = true;
+    Features["transpose-load-f4f6-insts"] = true;
+    Features["bf16-trans-insts"] = true;
+    Features["bf16-cvt-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["fp8e5m3-insts"] = true;
+    Features["permlane16-swap"] = true;
+    Features["ashr-pk-insts"] = true;
+    Features["atomic-buffer-pk-add-bf16-inst"] = true;
+    Features["vmem-pref-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-flat-pk-add-16-insts"] = true;
+    Features["atomic-global-pk-add-bf16-inst"] = true;
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["setprio-inc-wg-inst"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    Features["wavefrontsize32"] = true;
+    break;
+  case GK_GFX1201:
+  case GK_GFX1200:
+  case GK_GFX12_GENERIC:
+    Features["ci-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dot9-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dot11-insts"] = true;
+    Features["dot12-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["atomic-flat-pk-add-16-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-buffer-pk-add-bf16-inst"] = true;
+    Features["atomic-global-pk-add-bf16-inst"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["gfx12-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["image-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    break;
+  case GK_GFX1153:
+  case GK_GFX1152:
+  case GK_GFX1151:
+  case GK_GFX1150:
+  case GK_GFX1103:
+  case GK_GFX1102:
+  case GK_GFX1101:
+  case GK_GFX1100:
+  case GK_GFX11_GENERIC:
+    Features["ci-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dot9-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dot12-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["image-insts"] = true;
+    Features["gws"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    break;
+  case GK_GFX1036:
+  case GK_GFX1035:
+  case GK_GFX1034:
+  case GK_GFX1033:
+  case GK_GFX1032:
+  case GK_GFX1031:
+  case GK_GFX1030:
+  case GK_GFX10_3_GENERIC:
+    Features["ci-insts"] = true;
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot6-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["image-insts"] = true;
+    Features["s-memrealtime"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["vmem-to-lds-load-insts"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    break;
+  case GK_GFX1012:
+  case GK_GFX1011:
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot6-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot10-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX1013:
+  case GK_GFX1010:
+  case GK_GFX10_1_GENERIC:
+    Features["dl-insts"] = true;
+    Features["ci-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["image-insts"] = true;
+    Features["s-memrealtime"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["vmem-to-lds-load-insts"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    break;
+  case GK_GFX950:
+    Features["bitop3-insts"] = true;
+    Features["fp6bf6-cvt-scale-insts"] = true;
+    Features["fp4-cvt-scale-insts"] = true;
+    Features["bf8-cvt-scale-insts"] = true;
+    Features["fp8-cvt-scale-insts"] = true;
+    Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true;
+    Features["f32-to-f16bf16-cvt-sr-insts"] = true;
+    Features["prng-inst"] = true;
+    Features["permlane16-swap"] = true;
+    Features["permlane32-swap"] = true;
+    Features["ashr-pk-insts"] = true;
+    Features["dot12-insts"] = true;
+    Features["dot13-insts"] = true;
+    Features["atomic-buffer-pk-add-bf16-inst"] = true;
+    Features["gfx950-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX942:
+    Features["fp8-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    if (Kind != GK_GFX950)
+      Features["xf32-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX9_4_GENERIC:
+    Features["gfx940-insts"] = true;
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["atomic-flat-pk-add-16-insts"] = true;
+    Features["atomic-global-pk-add-bf16-inst"] = true;
+    Features["gfx90a-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["dot3-insts"] = true;
+    Features["dot4-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot6-insts"] = true;
+    Features["mai-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx8-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["s-memrealtime"] = true;
+    Features["ci-insts"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["vmem-to-lds-load-insts"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    Features["wavefrontsize64"] = true;
+    break;
+  case GK_GFX90A:
+    Features["gfx90a-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    [[fallthrough]];
+  case GK_GFX908:
+    Features["dot3-insts"] = true;
+    Features["dot4-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot6-insts"] = true;
+    Features["mai-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX906:
+    Features["dl-insts"] = true;
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot10-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX90C:
+  case GK_GFX909:
+  case GK_GFX904:
+  case GK_GFX902:
+  case GK_GFX900:
+  case GK_GFX9_GENERIC:
+    Features["gfx9-insts"] = true;
+    Features["vmem-to-lds-load-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX810:
+  case GK_GFX805:
+  case GK_GFX803:
+  case GK_GFX802:
+  case GK_GFX801:
+    Features["gfx8-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["s-memrealtime"] = true;
+    Features["ci-insts"] = true;
+    Features["image-insts"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["wavefrontsize64"] = true;
+    break;
+  case GK_GFX705:
+  case GK_GFX704:
+  case GK_GFX703:
+  case GK_GFX702:
+  case GK_GFX701:
+  case GK_GFX700:
+    Features["ci-insts"] = true;
+    [[fallthrough]];
+  case GK_GFX602:
+  case GK_GFX601:
+  case GK_GFX600:
+    Features["image-insts"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    Features["atomic-fmin-fmax-global-f64"] = true;
+    Features["wavefrontsize64"] = true;
+    break;
+  case GK_NONE:
+    break;
+  default:
+    llvm_unreachable("Unhandled GPU!");
+  }
+}
+
+/// Fills Features map with default values for given target GPU.
+/// \p Features contains overriding target features and this function returns
+/// default target features with entries overridden by \p Features.
+std::pair<FeatureError, StringRef>
+AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
+                             StringMap<bool> &Features) {
   // XXX - What does the member GPU mean if device name string passed here?
   if (T.isSPIRV() && T.getOS() == Triple::OSType::AMDHSA) {
     // AMDGCN SPIRV must support the union of all AMDGCN features. This list
@@ -434,276 +752,9 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
     Features["wavefrontsize32"] = true;
     Features["wavefrontsize64"] = true;
   } else if (T.isAMDGCN()) {
-    AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
-    switch (Kind) {
-    case GK_GFX1250:
-      Features["ci-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot8-insts"] = true;
-      Features["dl-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["gfx8-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx10-insts"] = true;
-      Features["gfx10-3-insts"] = true;
-      Features["gfx11-insts"] = true;
-      Features["gfx12-insts"] = true;
-      Features["gfx1250-insts"] = true;
-      Features["bitop3-insts"] = true;
-      Features["prng-inst"] = true;
-      Features["tanh-insts"] = true;
-      Features["tensor-cvt-lut-insts"] = true;
-      Features["transpose-load-f4f6-insts"] = true;
-      Features["bf16-trans-insts"] = true;
-      Features["bf16-cvt-insts"] = true;
-      Features["fp8-conversion-insts"] = true;
-      Features["fp8e5m3-insts"] = true;
-      Features["permlane16-swap"] = true;
-      Features["ashr-pk-insts"] = true;
-      Features["atomic-buffer-pk-add-bf16-inst"] = true;
-      Features["vmem-pref-insts"] = true;
-      Features["atomic-fadd-rtn-insts"] = true;
-      Features["atomic-buffer-global-pk-add-f16-insts"] = true;
-      Features["atomic-flat-pk-add-16-insts"] = true;
-      Features["atomic-global-pk-add-bf16-inst"] = true;
-      Features["atomic-ds-pk-add-16-insts"] = true;
-      Features["setprio-inc-wg-inst"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      break;
-    case GK_GFX1201:
-    case GK_GFX1200:
-    case GK_GFX12_GENERIC:
-      Features["ci-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot8-insts"] = true;
-      Features["dot9-insts"] = true;
-      Features["dot10-insts"] = true;
-      Features["dot11-insts"] = true;
-      Features["dot12-insts"] = true;
-      Features["dl-insts"] = true;
-      Features["atomic-ds-pk-add-16-insts"] = true;
-      Features["atomic-flat-pk-add-16-insts"] = true;
-      Features["atomic-buffer-global-pk-add-f16-insts"] = true;
-      Features["atomic-buffer-pk-add-bf16-inst"] = true;
-      Features["atomic-global-pk-add-bf16-inst"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["gfx8-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx10-insts"] = true;
-      Features["gfx10-3-insts"] = true;
-      Features["gfx11-insts"] = true;
-      Features["gfx12-insts"] = true;
-      Features["atomic-fadd-rtn-insts"] = true;
-      Features["image-insts"] = true;
-      Features["fp8-conversion-insts"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      break;
-    case GK_GFX1153:
-    case GK_GFX1152:
-    case GK_GFX1151:
-    case GK_GFX1150:
-    case GK_GFX1103:
-    case GK_GFX1102:
-    case GK_GFX1101:
-    case GK_GFX1100:
-    case GK_GFX11_GENERIC:
-      Features["ci-insts"] = true;
-      Features["dot5-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot8-insts"] = true;
-      Features["dot9-insts"] = true;
-      Features["dot10-insts"] = true;
-      Features["dot12-insts"] = true;
-      Features["dl-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["gfx8-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx10-insts"] = true;
-      Features["gfx10-3-insts"] = true;
-      Features["gfx11-insts"] = true;
-      Features["atomic-fadd-rtn-insts"] = true;
-      Features["image-insts"] = true;
-      Features["gws"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      break;
-    case GK_GFX1036:
-    case GK_GFX1035:
-    case GK_GFX1034:
-    case GK_GFX1033:
-    case GK_GFX1032:
-    case GK_GFX1031:
-    case GK_GFX1030:
-    case GK_GFX10_3_GENERIC:
-      Features["ci-insts"] = true;
-      Features["dot1-insts"] = true;
-      Features["dot2-insts"] = true;
-      Features["dot5-insts"] = true;
-      Features["dot6-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot10-insts"] = true;
-      Features["dl-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["gfx8-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx10-insts"] = true;
-      Features["gfx10-3-insts"] = true;
-      Features["image-insts"] = true;
-      Features["s-memrealtime"] = true;
-      Features["s-memtime-inst"] = true;
-      Features["gws"] = true;
-      Features["vmem-to-lds-load-insts"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      break;
-    case GK_GFX1012:
-    case GK_GFX1011:
-      Features["dot1-insts"] = true;
-      Features["dot2-insts"] = true;
-      Features["dot5-insts"] = true;
-      Features["dot6-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot10-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX1013:
-    case GK_GFX1010:
-    case GK_GFX10_1_GENERIC:
-      Features["dl-insts"] = true;
-      Features["ci-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["gfx8-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx10-insts"] = true;
-      Features["image-insts"] = true;
-      Features["s-memrealtime"] = true;
-      Features["s-memtime-inst"] = true;
-      Features["gws"] = true;
-      Features["vmem-to-lds-load-insts"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      break;
-    case GK_GFX950:
-      Features["bitop3-insts"] = true;
-      Features["fp6bf6-cvt-scale-insts"] = true;
-      Features["fp4-cvt-scale-insts"] = true;
-      Features["bf8-cvt-scale-insts"] = true;
-      Features["fp8-cvt-scale-insts"] = true;
-      Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true;
-      Features["f32-to-f16bf16-cvt-sr-insts"] = true;
-      Features["prng-inst"] = true;
-      Features["permlane16-swap"] = true;
-      Features["permlane32-swap"] = true;
-      Features["ashr-pk-insts"] = true;
-      Features["dot12-insts"] = true;
-      Features["dot13-insts"] = true;
-      Features["atomic-buffer-pk-add-bf16-inst"] = true;
-      Features["gfx950-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX942:
-      Features["fp8-insts"] = true;
-      Features["fp8-conversion-insts"] = true;
-      if (Kind != GK_GFX950)
-        Features["xf32-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX9_4_GENERIC:
-      Features["gfx940-insts"] = true;
-      Features["atomic-ds-pk-add-16-insts"] = true;
-      Features["atomic-flat-pk-add-16-insts"] = true;
-      Features["atomic-global-pk-add-bf16-inst"] = true;
-      Features["gfx90a-insts"] = true;
-      Features["atomic-buffer-global-pk-add-f16-insts"] = true;
-      Features["atomic-fadd-rtn-insts"] = true;
-      Features["dot3-insts"] = true;
-      Features["dot4-insts"] = true;
-      Features["dot5-insts"] = true;
-      Features["dot6-insts"] = true;
-      Features["mai-insts"] = true;
-      Features["dl-insts"] = true;
-      Features["dot1-insts"] = true;
-      Features["dot2-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot10-insts"] = true;
-      Features["gfx9-insts"] = true;
-      Features["gfx8-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["s-memrealtime"] = true;
-      Features["ci-insts"] = true;
-      Features["s-memtime-inst"] = true;
-      Features["gws"] = true;
-      Features["vmem-to-lds-load-insts"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      break;
-    case GK_GFX90A:
-      Features["gfx90a-insts"] = true;
-      Features["atomic-buffer-global-pk-add-f16-insts"] = true;
-      Features["atomic-fadd-rtn-insts"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      [[fallthrough]];
-    case GK_GFX908:
-      Features["dot3-insts"] = true;
-      Features["dot4-insts"] = true;
-      Features["dot5-insts"] = true;
-      Features["dot6-insts"] = true;
-      Features["mai-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX906:
-      Features["dl-insts"] = true;
-      Features["dot1-insts"] = true;
-      Features["dot2-insts"] = true;
-      Features["dot7-insts"] = true;
-      Features["dot10-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX90C:
-    case GK_GFX909:
-    case GK_GFX904:
-    case GK_GFX902:
-    case GK_GFX900:
-    case GK_GFX9_GENERIC:
-      Features["gfx9-insts"] = true;
-      Features["vmem-to-lds-load-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX810:
-    case GK_GFX805:
-    case GK_GFX803:
-    case GK_GFX802:
-    case GK_GFX801:
-      Features["gfx8-insts"] = true;
-      Features["16-bit-insts"] = true;
-      Features["dpp"] = true;
-      Features["s-memrealtime"] = true;
-      Features["ci-insts"] = true;
-      Features["image-insts"] = true;
-      Features["s-memtime-inst"] = true;
-      Features["gws"] = true;
-      break;
-    case GK_GFX705:
-    case GK_GFX704:
-    case GK_GFX703:
-    case GK_GFX702:
-    case GK_GFX701:
-    case GK_GFX700:
-      Features["ci-insts"] = true;
-      [[fallthrough]];
-    case GK_GFX602:
-    case GK_GFX601:
-    case GK_GFX600:
-      Features["image-insts"] = true;
-      Features["s-memtime-inst"] = true;
-      Features["gws"] = true;
-      Features["atomic-fmin-fmax-global-f32"] = true;
-      Features["atomic-fmin-fmax-global-f64"] = true;
-      break;
-    case GK_NONE:
-      break;
-    default:
-      llvm_unreachable("Unhandled GPU!");
-    }
+    StringMap<bool> DefaultFeatures;
+    fillAMDGCNFeatureMap(GPU, T, DefaultFeatures);
+    return insertWaveSizeFeature(GPU, T, DefaultFeatures, Features);
   } else {
     if (GPU.empty())
       GPU = "r600";
@@ -732,70 +783,5 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       llvm_unreachable("Unhandled GPU!");
     }
   }
-}
-
-static bool isWave32Capable(StringRef GPU, const Triple &T) {
-  bool IsWave32Capable = false;
-  // XXX - What does the member GPU mean if device name string passed here?
-  if (T.isAMDGCN()) {
-    switch (parseArchAMDGCN(GPU)) {
-    case GK_GFX1250:
-    case GK_GFX1201:
-    case GK_GFX1200:
-    case GK_GFX1153:
-    case GK_GFX1152:
-    case GK_GFX1151:
-    case GK_GFX1150:
-    case GK_GFX1103:
-    case GK_GFX1102:
-    case GK_GFX1101:
-    case GK_GFX1100:
-    case GK_GFX1036:
-    case GK_GFX1035:
-    case GK_GFX1034:
-    case GK_GFX1033:
-    case GK_GFX1032:
-    case GK_GFX1031:
-    case GK_GFX1030:
-    case GK_GFX1012:
-    case GK_GFX1011:
-    case GK_GFX1013:
-    case GK_GFX1010:
-    case GK_GFX12_GENERIC:
-    case GK_GFX11_GENERIC:
-    case GK_GFX10_3_GENERIC:
-    case GK_GFX10_1_GENERIC:
-      IsWave32Capable = true;
-      break;
-    default:
-      break;
-    }
-  }
-  return IsWave32Capable;
-}
-
-std::pair<FeatureError, StringRef>
-AMDGPU::insertWaveSizeFeature(StringRef GPU, const Triple &T,
-                              StringMap<bool> &Features) {
-  bool IsWave32Capable = isWave32Capable(GPU, T);
-  const bool IsNullGPU = GPU.empty();
-  const bool HaveWave32 = Features.count("wavefrontsize32");
-  const bool HaveWave64 = Features.count("wavefrontsize64");
-  if (HaveWave32 && HaveWave64) {
-    return {AMDGPU::INVALID_FEATURE_COMBINATION,
-            "'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive"};
-  }
-  if (HaveWave32 && !IsNullGPU && !IsWave32Capable) {
-    return {AMDGPU::UNSUPPORTED_TARGET_FEATURE, "wavefrontsize32"};
-  }
-  // Don't assume any wavesize with an unknown subtarget.
-  if (!IsNullGPU) {
-    // Default to wave32 if available, or wave64 if not
-    if (!HaveWave32 && !HaveWave64) {
-      StringRef DefaultWaveSizeFeature =
-          IsWave32Capable ? "wavefrontsize32" : "wavefrontsize64";
-      Features.insert(std::make_pair(DefaultWaveSizeFeature, true));
-    }
-  }
   return {NO_ERROR, StringRef()};
 }
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 6acb0bc49ecf..ac3626db46ea 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -158,6 +158,8 @@ StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) {
       return "dxilv1.7";
     case Triple::DXILSubArch_v1_8:
       return "dxilv1.8";
+    case Triple::DXILSubArch_v1_9:
+      return "dxilv1.9";
     default:
       break;
     }
@@ -329,6 +331,8 @@ StringRef Triple::getOSTypeName(OSType Kind) {
   case LiteOS: return "liteos";
   case XROS: return "xros";
   case Vulkan: return "vulkan";
+  case CheriotRTOS:
+    return "cheriotrtos";
   }
 
   llvm_unreachable("Invalid OSType");
@@ -387,6 +391,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
   case Callable: return "callable";
   case Mesh: return "mesh";
   case Amplification: return "amplification";
+  case RootSignature:
+    return "rootsignature";
   case OpenCL:
     return "opencl";
   case OpenHOS: return "ohos";
@@ -648,6 +654,8 @@ static Triple::ArchType parseArch(StringRef ArchName) {
                 .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3",
                        "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7",
                        "dxilv1.8", Triple::dxil)
+                // Note: Cases takes at most 10 strings, so use a separate Case.
+                .Case("dxilv1.9", Triple::dxil)
                 .Case("xtensa", Triple::xtensa)
                 .Default(Triple::UnknownArch);
 
@@ -687,49 +695,50 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
 
 static Triple::OSType parseOS(StringRef OSName) {
   return StringSwitch<Triple::OSType>(OSName)
-    .StartsWith("darwin", Triple::Darwin)
-    .StartsWith("dragonfly", Triple::DragonFly)
-    .StartsWith("freebsd", Triple::FreeBSD)
-    .StartsWith("fuchsia", Triple::Fuchsia)
-    .StartsWith("ios", Triple::IOS)
-    .StartsWith("kfreebsd", Triple::KFreeBSD)
-    .StartsWith("linux", Triple::Linux)
-    .StartsWith("lv2", Triple::Lv2)
-    .StartsWith("macos", Triple::MacOSX)
-    .StartsWith("managarm", Triple::Managarm)
-    .StartsWith("netbsd", Triple::NetBSD)
-    .StartsWith("openbsd", Triple::OpenBSD)
-    .StartsWith("solaris", Triple::Solaris)
-    .StartsWith("uefi", Triple::UEFI)
-    .StartsWith("win32", Triple::Win32)
-    .StartsWith("windows", Triple::Win32)
-    .StartsWith("zos", Triple::ZOS)
-    .StartsWith("haiku", Triple::Haiku)
-    .StartsWith("rtems", Triple::RTEMS)
-    .StartsWith("aix", Triple::AIX)
-    .StartsWith("cuda", Triple::CUDA)
-    .StartsWith("nvcl", Triple::NVCL)
-    .StartsWith("amdhsa", Triple::AMDHSA)
-    .StartsWith("ps4", Triple::PS4)
-    .StartsWith("ps5", Triple::PS5)
-    .StartsWith("elfiamcu", Triple::ELFIAMCU)
-    .StartsWith("tvos", Triple::TvOS)
-    .StartsWith("watchos", Triple::WatchOS)
-    .StartsWith("bridgeos", Triple::BridgeOS)
-    .StartsWith("driverkit", Triple::DriverKit)
-    .StartsWith("xros", Triple::XROS)
-    .StartsWith("visionos", Triple::XROS)
-    .StartsWith("mesa3d", Triple::Mesa3D)
-    .StartsWith("amdpal", Triple::AMDPAL)
-    .StartsWith("hermit", Triple::HermitCore)
-    .StartsWith("hurd", Triple::Hurd)
-    .StartsWith("wasi", Triple::WASI)
-    .StartsWith("emscripten", Triple::Emscripten)
-    .StartsWith("shadermodel", Triple::ShaderModel)
-    .StartsWith("liteos", Triple::LiteOS)
-    .StartsWith("serenity", Triple::Serenity)
-    .StartsWith("vulkan", Triple::Vulkan)
-    .Default(Triple::UnknownOS);
+      .StartsWith("darwin", Triple::Darwin)
+      .StartsWith("dragonfly", Triple::DragonFly)
+      .StartsWith("freebsd", Triple::FreeBSD)
+      .StartsWith("fuchsia", Triple::Fuchsia)
+      .StartsWith("ios", Triple::IOS)
+      .StartsWith("kfreebsd", Triple::KFreeBSD)
+      .StartsWith("linux", Triple::Linux)
+      .StartsWith("lv2", Triple::Lv2)
+      .StartsWith("macos", Triple::MacOSX)
+      .StartsWith("managarm", Triple::Managarm)
+      .StartsWith("netbsd", Triple::NetBSD)
+      .StartsWith("openbsd", Triple::OpenBSD)
+      .StartsWith("solaris", Triple::Solaris)
+      .StartsWith("uefi", Triple::UEFI)
+      .StartsWith("win32", Triple::Win32)
+      .StartsWith("windows", Triple::Win32)
+      .StartsWith("zos", Triple::ZOS)
+      .StartsWith("haiku", Triple::Haiku)
+      .StartsWith("rtems", Triple::RTEMS)
+      .StartsWith("aix", Triple::AIX)
+      .StartsWith("cuda", Triple::CUDA)
+      .StartsWith("nvcl", Triple::NVCL)
+      .StartsWith("amdhsa", Triple::AMDHSA)
+      .StartsWith("ps4", Triple::PS4)
+      .StartsWith("ps5", Triple::PS5)
+      .StartsWith("elfiamcu", Triple::ELFIAMCU)
+      .StartsWith("tvos", Triple::TvOS)
+      .StartsWith("watchos", Triple::WatchOS)
+      .StartsWith("bridgeos", Triple::BridgeOS)
+      .StartsWith("driverkit", Triple::DriverKit)
+      .StartsWith("xros", Triple::XROS)
+      .StartsWith("visionos", Triple::XROS)
+      .StartsWith("mesa3d", Triple::Mesa3D)
+      .StartsWith("amdpal", Triple::AMDPAL)
+      .StartsWith("hermit", Triple::HermitCore)
+      .StartsWith("hurd", Triple::Hurd)
+      .StartsWith("wasi", Triple::WASI)
+      .StartsWith("emscripten", Triple::Emscripten)
+      .StartsWith("shadermodel", Triple::ShaderModel)
+      .StartsWith("liteos", Triple::LiteOS)
+      .StartsWith("serenity", Triple::Serenity)
+      .StartsWith("vulkan", Triple::Vulkan)
+      .StartsWith("cheriotrtos", Triple::CheriotRTOS)
+      .Default(Triple::UnknownOS);
 }
 
 static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
@@ -780,6 +789,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
       .StartsWith("callable", Triple::Callable)
       .StartsWith("mesh", Triple::Mesh)
       .StartsWith("amplification", Triple::Amplification)
+      .StartsWith("rootsignature", Triple::RootSignature)
       .StartsWith("opencl", Triple::OpenCL)
       .StartsWith("ohos", Triple::OpenHOS)
       .StartsWith("pauthtest", Triple::PAuthTest)
@@ -839,6 +849,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
         .EndsWith("v1.6", Triple::DXILSubArch_v1_6)
         .EndsWith("v1.7", Triple::DXILSubArch_v1_7)
         .EndsWith("v1.8", Triple::DXILSubArch_v1_8)
+        .EndsWith("v1.9", Triple::DXILSubArch_v1_9)
         .Default(Triple::NoSubArch);
 
   StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName);
@@ -1108,7 +1119,7 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) {
   VersionTuple Ver =
       parseVersionFromName(ShaderModelStr.drop_front(strlen("shadermodel")));
   // Default DXIL minor version when Shader Model version is anything other
-  // than 6.[0...8] or 6.x (which translates to latest current SM version)
+  // than 6.[0...9] or 6.x (which translates to latest current SM version)
   const unsigned SMMajor = 6;
   if (!Ver.empty()) {
     if (Ver.getMajor() == SMMajor) {
@@ -1132,6 +1143,8 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) {
           return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_7);
         case 8:
           return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_8);
+        case 9:
+          return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_9);
         default:
           report_fatal_error("Unsupported Shader Model version", false);
         }
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index b72096553ad9..edca7c18062a 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -72,7 +72,7 @@ constexpr FeatureBitset FeaturesX86_64_V2 = FeaturesX86_64 | FeatureSAHF |
 constexpr FeatureBitset FeaturesX86_64_V3 =
     FeaturesX86_64_V2 | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureF16C |
     FeatureFMA | FeatureLZCNT | FeatureMOVBE | FeatureXSAVE;
-constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 | FeatureEVEX512 |
+constexpr FeatureBitset FeaturesX86_64_V4 = FeaturesX86_64_V3 |
                                             FeatureAVX512BW | FeatureAVX512CD |
                                             FeatureAVX512DQ | FeatureAVX512VL;
 
@@ -95,9 +95,8 @@ constexpr FeatureBitset FeaturesBroadwell =
 
 // Intel Knights Landing and Knights Mill
 // Knights Landing has feature parity with Broadwell.
-constexpr FeatureBitset FeaturesKNL = FeaturesBroadwell | FeatureAES |
-                                      FeatureAVX512F | FeatureEVEX512 |
-                                      FeatureAVX512CD;
+constexpr FeatureBitset FeaturesKNL =
+    FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureAVX512CD;
 constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ;
 
 // Intel Skylake processors.
@@ -107,9 +106,9 @@ constexpr FeatureBitset FeaturesSkylakeClient =
 // SkylakeServer inherits all SkylakeClient features except SGX.
 // FIXME: That doesn't match gcc.
 constexpr FeatureBitset FeaturesSkylakeServer =
-    (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureEVEX512 |
-    FeatureAVX512CD | FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL |
-    FeatureCLWB | FeaturePKU;
+    (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureAVX512CD |
+    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureCLWB |
+    FeaturePKU;
 constexpr FeatureBitset FeaturesCascadeLake =
     FeaturesSkylakeServer | FeatureAVX512VNNI;
 constexpr FeatureBitset FeaturesCooperLake =
@@ -117,9 +116,9 @@ constexpr FeatureBitset FeaturesCooperLake =
 
 // Intel 10nm processors.
 constexpr FeatureBitset FeaturesCannonlake =
-    FeaturesSkylakeClient | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD |
-    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA |
-    FeatureAVX512VBMI | FeaturePKU | FeatureSHA;
+    FeaturesSkylakeClient | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ |
+    FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI |
+    FeaturePKU | FeatureSHA;
 constexpr FeatureBitset FeaturesICLClient =
     FeaturesCannonlake | FeatureAVX512BITALG | FeatureAVX512VBMI2 |
     FeatureAVX512VNNI | FeatureAVX512VPOPCNTDQ | FeatureGFNI | FeatureRDPID |
@@ -139,7 +138,7 @@ constexpr FeatureBitset FeaturesSapphireRapids =
 constexpr FeatureBitset FeaturesGraniteRapids =
     FeaturesSapphireRapids | FeatureAMX_FP16 | FeaturePREFETCHI;
 constexpr FeatureBitset FeaturesDiamondRapids =
-    FeaturesGraniteRapids | FeatureAMX_COMPLEX | FeatureAVX10_2_512 |
+    FeaturesGraniteRapids | FeatureAMX_COMPLEX | FeatureAVX10_2 |
     FeatureCMPCCXADD | FeatureAVXIFMA | FeatureAVXNECONVERT |
     FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
     FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
@@ -244,11 +243,10 @@ static constexpr FeatureBitset FeaturesZNVER3 = FeaturesZNVER2 |
                                                 FeatureINVPCID | FeaturePKU |
                                                 FeatureVAES | FeatureVPCLMULQDQ;
 static constexpr FeatureBitset FeaturesZNVER4 =
-    FeaturesZNVER3 | FeatureAVX512F | FeatureEVEX512 | FeatureAVX512CD |
-    FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA |
-    FeatureAVX512VBMI | FeatureAVX512VBMI2 | FeatureAVX512VNNI |
-    FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 |
-    FeatureGFNI | FeatureSHSTK;
+    FeaturesZNVER3 | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ |
+    FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI |
+    FeatureAVX512VBMI2 | FeatureAVX512VNNI | FeatureAVX512BITALG |
+    FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 | FeatureGFNI | FeatureSHSTK;
 
 static constexpr FeatureBitset FeaturesZNVER5 =
     FeaturesZNVER4 | FeatureAVXVNNI | FeatureMOVDIRI | FeatureMOVDIR64B |
@@ -394,7 +392,7 @@ constexpr ProcInfo Processors[] = {
   // Clearwaterforest microarchitecture based processors.
   { {"clearwaterforest"}, CK_Lunarlake, FEATURE_AVX2, FeaturesClearwaterforest, 'p', false },
   // Diamond Rapids microarchitecture based processors.
-  { {"diamondrapids"}, CK_Diamondrapids, FEATURE_AVX10_2_512, FeaturesDiamondRapids, 'z', false },
+  { {"diamondrapids"}, CK_Diamondrapids, FEATURE_AVX10_2, FeaturesDiamondRapids, 'z', false },
   // Knights Landing processor.
   { {"knl"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL, 'Z', false },
   { {"mic_avx512"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL, 'Z', true },
@@ -616,7 +614,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
-    FeatureAMX_TILE | FeatureAVX10_2_512;
+    FeatureAMX_TILE | FeatureAVX10_2;
 constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 
@@ -642,11 +640,9 @@ constexpr FeatureBitset ImpliedFeaturesAVX10_1 =
     FeatureAVX512VNNI | FeatureAVX512BF16 | FeatureAVX512VPOPCNTDQ |
     FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16 |
     FeatureAVX512DQ | FeatureAVX512VL;
-constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 =
-    FeatureAVX10_1 | FeatureEVEX512;
 constexpr FeatureBitset ImpliedFeaturesAVX10_2 = FeatureAVX10_1;
-constexpr FeatureBitset ImpliedFeaturesAVX10_2_512 =
-    FeatureAVX10_2 | FeatureAVX10_1_512;
+constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 = FeatureAVX10_1;
+constexpr FeatureBitset ImpliedFeaturesAVX10_2_512 = FeatureAVX10_2;
 
 // APX Features
 constexpr FeatureBitset ImpliedFeaturesEGPR = {};
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 40a7f8043034..40de36d81ddd 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -83,8 +83,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
     //  == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
     if (match(V, m_OneUse(m_c_Or(
                      m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
-                     m_LShr(m_Value(ShVal1),
-                            m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
+                     m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
+                                                   m_Deferred(ShAmt))))))) {
       return Intrinsic::fshl;
     }
 
@@ -617,7 +617,7 @@ struct LoadOps {
   LoadInst *RootInsert = nullptr;
   bool FoundRoot = false;
   uint64_t LoadSize = 0;
-  const APInt *Shift = nullptr;
+  uint64_t Shift = 0;
   Type *ZextType;
   AAMDNodes AATags;
 };
@@ -627,17 +627,15 @@ struct LoadOps {
 // (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
 static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
                                AliasAnalysis &AA) {
-  const APInt *ShAmt2 = nullptr;
+  uint64_t ShAmt2;
   Value *X;
   Instruction *L1, *L2;
 
   // Go to the last node with loads.
-  if (match(V, m_OneUse(m_c_Or(
-                   m_Value(X),
-                   m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))),
-                                  m_APInt(ShAmt2)))))) ||
-      match(V, m_OneUse(m_Or(m_Value(X),
-                             m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) {
+  if (match(V,
+            m_OneUse(m_c_Or(m_Value(X), m_OneUse(m_ShlOrSelf(
+                                            m_OneUse(m_ZExt(m_Instruction(L2))),
+                                            ShAmt2)))))) {
     if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot)
       // Avoid Partial chain merge.
       return false;
@@ -646,11 +644,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
 
   // Check if the pattern has loads
   LoadInst *LI1 = LOps.Root;
-  const APInt *ShAmt1 = LOps.Shift;
+  uint64_t ShAmt1 = LOps.Shift;
   if (LOps.FoundRoot == false &&
-      (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) ||
-       match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))),
-                               m_APInt(ShAmt1)))))) {
+      match(X, m_OneUse(
+                   m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
     LI1 = dyn_cast<LoadInst>(L1);
   }
   LoadInst *LI2 = dyn_cast<LoadInst>(L2);
@@ -726,13 +723,6 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
   if (IsBigEndian)
     std::swap(ShAmt1, ShAmt2);
 
-  // Find Shifts values.
-  uint64_t Shift1 = 0, Shift2 = 0;
-  if (ShAmt1)
-    Shift1 = ShAmt1->getZExtValue();
-  if (ShAmt2)
-    Shift2 = ShAmt2->getZExtValue();
-
   // First load is always LI1. This is where we put the new load.
   // Use the merged load size available from LI1 for forward loads.
   if (LOps.FoundRoot) {
@@ -747,7 +737,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
   uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
   uint64_t PrevSize =
       DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
-  if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
+  if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
     return false;
 
   // Update LOps
@@ -824,7 +814,7 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
   // Check if shift needed. We need to shift with the amount of load1
   // shift if not zero.
   if (LOps.Shift)
-    NewOp = Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift));
+    NewOp = Builder.CreateShl(NewOp, LOps.Shift);
   I.replaceAllUsesWith(NewOp);
 
   return true;
@@ -860,11 +850,9 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
     return std::nullopt;
 
   uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
-  uint64_t ValOffset = 0;
+  uint64_t ValOffset;
   Value *Val;
-  if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
-                                                   m_ConstantInt(ValOffset))),
-                                    m_Trunc(m_Value(Val)))))
+  if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
     return std::nullopt;
 
   Value *Ptr = Store->getPointerOperand();
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index b775c4346019..08f03aa45255 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -700,9 +700,6 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
 
   DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);
 
-  assert(Shape.getPromiseAlloca() &&
-         "Coroutine with switch ABI should own Promise alloca");
-
   DIFile *DFile = DIS->getFile();
   unsigned LineNum = DIS->getLine();
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 180ac9c61e7d..02c38d02cff6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1568,14 +1568,22 @@ private:
         if (DebugLoc SuspendLoc = S->getDebugLoc()) {
           std::string LabelName =
               ("__coro_resume_" + Twine(SuspendIndex)).str();
-          DILocation &DILoc = *SuspendLoc;
+          // Walk up the "inlined at" chain to its outermost location, if any.
+          // This is mandatory because the DILabel insertion checks that the
+          // scope of the label matches the scope of the attached location. An
+          // inlined suspend location violates that, since it still points to
+          // the scope it was inlined from.
+          DILocation *DILoc = SuspendLoc;
+          while (DILocation *InlinedAt = DILoc->getInlinedAt())
+            DILoc = InlinedAt;
+
           DILabel *ResumeLabel =
-              DBuilder.createLabel(DIS, LabelName, DILoc.getFile(),
+              DBuilder.createLabel(DIS, LabelName, DILoc->getFile(),
                                    SuspendLoc.getLine(), SuspendLoc.getCol(),
                                    /*IsArtificial=*/true,
                                    /*CoroSuspendIdx=*/SuspendIndex,
                                    /*AlwaysPreserve=*/false);
-          DBuilder.insertLabel(ResumeLabel, &DILoc, ResumeBB->begin());
+          DBuilder.insertLabel(ResumeLabel, DILoc, ResumeBB->begin());
         }
       }
 
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 7bcb20de46ff..83aa7de5400f 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -1550,6 +1551,7 @@ void llvm::computeDeadSymbolsWithConstProp(
     const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
     function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
     bool ImportEnabled) {
+  llvm::TimeTraceScope timeScope("Drop dead symbols and propagate attributes");
   computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols,
                                            isPrevailing);
   if (ImportEnabled)
@@ -1664,6 +1666,7 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
 void llvm::thinLTOFinalizeInModule(Module &TheModule,
                                    const GVSummaryMapTy &DefinedGlobals,
                                    bool PropagateAttrs) {
+  llvm::TimeTraceScope timeScope("ThinLTO finalize in module");
   DenseSet<Comdat *> NonPrevailingComdats;
   auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) {
     // See if the global summary analysis computed a new resolved linkage.
@@ -1791,6 +1794,7 @@ void llvm::thinLTOFinalizeInModule(Module &TheModule,
 /// Run internalization on \p TheModule based on symmary analysis.
 void llvm::thinLTOInternalizeModule(Module &TheModule,
                                     const GVSummaryMapTy &DefinedGlobals) {
+  llvm::TimeTraceScope timeScope("ThinLTO internalize module");
   // Declare a callback for the internalize pass that will ask for every
   // candidate GlobalValue if it can be internalized or not.
   auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
@@ -1885,6 +1889,7 @@ Expected<bool> FunctionImporter::importFunctions(
 
   // Do the actual import of functions now, one Module at a time
   for (const auto &ModName : ImportList.getSourceModules()) {
+    llvm::TimeTraceScope timeScope("Import", ModName);
     // Get the module for the import
     Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(ModName);
     if (!SrcModuleOrErr)
@@ -1900,102 +1905,114 @@ Expected<bool> FunctionImporter::importFunctions(
 
     // Find the globals to import
     SetVector<GlobalValue *> GlobalsToImport;
-    for (Function &F : *SrcModule) {
-      if (!F.hasName())
-        continue;
-      auto GUID = F.getGUID();
-      auto MaybeImportType = ImportList.getImportType(ModName, GUID);
-      bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing function"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
-                        << GUID << " " << F.getName() << " from "
-                        << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
-        if (Error Err = F.materialize())
-          return std::move(Err);
-        // MemProf should match function's definition and summary,
-        // 'thinlto_src_module' is needed.
-        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
-          // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
-          // statistics and debugging.
-          F.setMetadata(
-              "thinlto_src_module",
-              MDNode::get(DestModule.getContext(),
-                          {MDString::get(DestModule.getContext(),
-                                         SrcModule->getModuleIdentifier())}));
-          F.setMetadata(
-              "thinlto_src_file",
-              MDNode::get(DestModule.getContext(),
-                          {MDString::get(DestModule.getContext(),
-                                         SrcModule->getSourceFileName())}));
+    {
+      llvm::TimeTraceScope functionsScope("Functions");
+      for (Function &F : *SrcModule) {
+        if (!F.hasName())
+          continue;
+        auto GUID = F.getGUID();
+        auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+        bool ImportDefinition =
+            MaybeImportType == GlobalValueSummary::Definition;
+
+        LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                          << " importing function"
+                          << (ImportDefinition
+                                  ? " definition "
+                                  : (MaybeImportType ? " declaration " : " "))
+                          << GUID << " " << F.getName() << " from "
+                          << SrcModule->getSourceFileName() << "\n");
+        if (ImportDefinition) {
+          if (Error Err = F.materialize())
+            return std::move(Err);
+          // MemProf should match function's definition and summary,
+          // 'thinlto_src_module' is needed.
+          if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
+            // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
+            // statistics and debugging.
+            F.setMetadata(
+                "thinlto_src_module",
+                MDNode::get(DestModule.getContext(),
+                            {MDString::get(DestModule.getContext(),
+                                           SrcModule->getModuleIdentifier())}));
+            F.setMetadata(
+                "thinlto_src_file",
+                MDNode::get(DestModule.getContext(),
+                            {MDString::get(DestModule.getContext(),
+                                           SrcModule->getSourceFileName())}));
+          }
+          GlobalsToImport.insert(&F);
         }
-        GlobalsToImport.insert(&F);
       }
     }
-    for (GlobalVariable &GV : SrcModule->globals()) {
-      if (!GV.hasName())
-        continue;
-      auto GUID = GV.getGUID();
-      auto MaybeImportType = ImportList.getImportType(ModName, GUID);
-      bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing global"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
-                        << GUID << " " << GV.getName() << " from "
-                        << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
-        if (Error Err = GV.materialize())
-          return std::move(Err);
-        ImportedGVCount += GlobalsToImport.insert(&GV);
+    {
+      llvm::TimeTraceScope globalsScope("Globals");
+      for (GlobalVariable &GV : SrcModule->globals()) {
+        if (!GV.hasName())
+          continue;
+        auto GUID = GV.getGUID();
+        auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+        bool ImportDefinition =
+            MaybeImportType == GlobalValueSummary::Definition;
+
+        LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                          << " importing global"
+                          << (ImportDefinition
+                                  ? " definition "
+                                  : (MaybeImportType ? " declaration " : " "))
+                          << GUID << " " << GV.getName() << " from "
+                          << SrcModule->getSourceFileName() << "\n");
+        if (ImportDefinition) {
+          if (Error Err = GV.materialize())
+            return std::move(Err);
+          ImportedGVCount += GlobalsToImport.insert(&GV);
+        }
       }
     }
-    for (GlobalAlias &GA : SrcModule->aliases()) {
-      if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
-        continue;
-      auto GUID = GA.getGUID();
-      auto MaybeImportType = ImportList.getImportType(ModName, GUID);
-      bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition;
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing alias"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
-                        << GUID << " " << GA.getName() << " from "
-                        << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
-        if (Error Err = GA.materialize())
-          return std::move(Err);
-        // Import alias as a copy of its aliasee.
-        GlobalObject *GO = GA.getAliaseeObject();
-        if (Error Err = GO->materialize())
-          return std::move(Err);
-        auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
-        LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " "
-                          << GO->getName() << " from "
+    {
+      llvm::TimeTraceScope aliasesScope("Aliases");
+      for (GlobalAlias &GA : SrcModule->aliases()) {
+        if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
+          continue;
+        auto GUID = GA.getGUID();
+        auto MaybeImportType = ImportList.getImportType(ModName, GUID);
+        bool ImportDefinition =
+            MaybeImportType == GlobalValueSummary::Definition;
+
+        LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                          << " importing alias"
+                          << (ImportDefinition
+                                  ? " definition "
+                                  : (MaybeImportType ? " declaration " : " "))
+                          << GUID << " " << GA.getName() << " from "
                           << SrcModule->getSourceFileName() << "\n");
-        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
-          // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
-          // statistics and debugging.
-          Fn->setMetadata(
-              "thinlto_src_module",
-              MDNode::get(DestModule.getContext(),
-                          {MDString::get(DestModule.getContext(),
-                                         SrcModule->getModuleIdentifier())}));
-          Fn->setMetadata(
-              "thinlto_src_file",
-              MDNode::get(DestModule.getContext(),
-                          {MDString::get(DestModule.getContext(),
-                                         SrcModule->getSourceFileName())}));
+        if (ImportDefinition) {
+          if (Error Err = GA.materialize())
+            return std::move(Err);
+          // Import alias as a copy of its aliasee.
+          GlobalObject *GO = GA.getAliaseeObject();
+          if (Error Err = GO->materialize())
+            return std::move(Err);
+          auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
+          LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID()
+                            << " " << GO->getName() << " from "
+                            << SrcModule->getSourceFileName() << "\n");
+          if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
+            // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
+            // statistics and debugging.
+            Fn->setMetadata(
+                "thinlto_src_module",
+                MDNode::get(DestModule.getContext(),
+                            {MDString::get(DestModule.getContext(),
+                                           SrcModule->getModuleIdentifier())}));
+            Fn->setMetadata(
+                "thinlto_src_file",
+                MDNode::get(DestModule.getContext(),
+                            {MDString::get(DestModule.getContext(),
+                                           SrcModule->getSourceFileName())}));
+          }
+          GlobalsToImport.insert(Fn);
         }
-        GlobalsToImport.insert(Fn);
       }
     }
 
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 9196a0147c43..30459caee160 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -89,6 +89,8 @@ static cl::opt<bool> SpecializeLiteralConstant(
         "Enable specialization of functions that take a literal constant as an "
         "argument"));
 
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
 bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB,
                                             BasicBlock *Succ) const {
   unsigned I = 0;
@@ -784,9 +786,31 @@ bool FunctionSpecializer::run() {
 
     // Update the known call sites to call the clone.
     for (CallBase *Call : S.CallSites) {
+      Function *Clone = S.Clone;
       LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
-                        << " to call " << S.Clone->getName() << "\n");
+                        << " to call " << Clone->getName() << "\n");
       Call->setCalledFunction(S.Clone);
+      // Move this call site's profile weight from the original to the clone.
+      auto &BFI = GetBFI(*Call->getFunction());
+      std::optional<uint64_t> Count =
+          BFI.getBlockProfileCount(Call->getParent());
+      std::optional<llvm::Function::ProfileCount> MaybeCloneCount =
+          Clone->getEntryCount();
+      // Guard rather than assert: the clone may have no entry count at all.
+      if (Count && MaybeCloneCount && !ProfcheckDisableMetadataFixes) {
+        Clone->setEntryCount(*Count + MaybeCloneCount->getCount());
+        if (std::optional<llvm::Function::ProfileCount> MaybeOriginalCount =
+                S.F->getEntryCount()) {
+          uint64_t OriginalCount = MaybeOriginalCount->getCount();
+          // Deduct only this site's count; the clone total is cumulative.
+          if (OriginalCount >= *Count) {
+            S.F->setEntryCount(OriginalCount - *Count);
+          } else {
+            // More computed calls than recorded entries; clamp in all builds.
+            S.F->setEntryCount(0);
+          }
+        }
+      }
     }
 
     Clones.push_back(S.Clone);
@@ -838,14 +862,24 @@ bool FunctionSpecializer::run() {
 }
 
 void FunctionSpecializer::removeDeadFunctions() {
-  for (Function *F : FullySpecialized) {
+  for (Function *F : DeadFunctions) {
     LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                       << F->getName() << "\n");
     if (FAM)
       FAM->clear(*F, F->getName());
+
+    // Any call sites still using F were proven unreachable; erase them and
+    // replace uses of their results with poison.
+    for (User *U : make_early_inc_range(F->users())) {
+      assert((isa<CallInst>(U) || isa<InvokeInst>(U)) &&
+             "User of dead function must be call or invoke");
+      Instruction *CS = cast<Instruction>(U);
+      CS->replaceAllUsesWith(PoisonValue::get(CS->getType()));
+      CS->eraseFromParent();
+    }
     F->eraseFromParent();
   }
-  FullySpecialized.clear();
+  DeadFunctions.clear();
 }
 
 /// Clone the function \p F and remove the ssa_copy intrinsics added by
@@ -1033,6 +1067,9 @@ Function *FunctionSpecializer::createSpecialization(Function *F,
   // clone must.
   Clone->setLinkage(GlobalValue::InternalLinkage);
 
+  if (F->getEntryCount() && !ProfcheckDisableMetadataFixes)
+    Clone->setEntryCount(0);
+
   // Initialize the lattice state of the arguments of the function clone,
   // marking the argument on which we specialized the function constant
   // with the given value.
@@ -1206,8 +1243,11 @@ void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
 
   // If the function has been completely specialized, the original function
   // is no longer needed. Mark it unreachable.
-  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
+  // NOTE: If the address of a function is taken, we cannot treat it as a dead
+  // function.
+  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F) &&
+      !F->hasAddressTaken()) {
     Solver.markFunctionUnreachable(F);
-    FullySpecialized.insert(F);
+    DeadFunctions.insert(F);
   }
 }
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index d7edd1288309..f88d51f443bc 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2551,7 +2551,8 @@ static bool OptimizeNonTrivialIFuncs(
         }))
       continue;
 
-    assert(!Callees.empty() && "Expecting successful collection of versions");
+    if (Callees.empty())
+      continue;
 
     LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
                       << Resolver->getName() << "\n");
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index c57981ae4ca0..fdf0c3ac8007 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -686,9 +686,6 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
         /* Outlined code is optimized code by definition. */
         DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
 
-    // Don't add any new variables to the subprogram.
-    DB.finalizeSubprogram(OutlinedSP);
-
     // Attach subprogram to the function.
     F->setSubprogram(OutlinedSP);
     // We're done with the DIBuilder.
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 57844a10aa9c..821a9d82ddb0 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -504,10 +504,7 @@ class LowerTypeTestsModule {
   void importTypeTest(CallInst *CI);
   void importFunction(Function *F, bool isJumpTableCanonical);
 
-  BitSetInfo
-  buildBitSet(Metadata *TypeId,
-              const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
-  ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+  ByteArrayInfo *createByteArray(const BitSetInfo &BSI);
   void allocateByteArrays();
   Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
                           Value *BitOffset);
@@ -578,9 +575,9 @@ public:
 
 /// Build a bit set for TypeId using the object layouts in
 /// GlobalLayout.
-BitSetInfo LowerTypeTestsModule::buildBitSet(
-    Metadata *TypeId,
-    const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+static BitSetInfo
+buildBitSet(Metadata *TypeId,
+            const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
   BitSetBuilder BSB;
 
   // Compute the byte offset of each address associated with this type
@@ -615,7 +612,7 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
   return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
 }
 
-ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(const BitSetInfo &BSI) {
   // Create globals to stand in for byte arrays and masks. These never actually
   // get initialized, we RAUW and erase them later in allocateByteArrays() once
   // we know the offset and mask to use.
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b8c99f1f3389..7f9693169af0 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3965,6 +3965,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
 void ModuleCallsiteContextGraph::updateAllocationCall(
     CallInfo &Call, AllocationType AllocType) {
   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+  removeAnyExistingAmbiguousAttribute(cast<CallBase>(Call.call()));
   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                 "memprof", AllocTypeString);
   cast<CallBase>(Call.call())->addFnAttr(A);
@@ -5501,6 +5502,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
               // clone J-1 (J==0 is the original clone and does not have a VMaps
               // entry).
               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            removeAnyExistingAmbiguousAttribute(CBClone);
             CBClone->addFnAttr(A);
             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                      << ore::NV("AllocationCall", CBClone) << " in clone "
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index d50de34dfa48..2ecadd529170 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -169,6 +169,13 @@ static bool runIPSCCP(
   for (Function &F : M) {
     if (F.isDeclaration())
       continue;
+    // Skip the dead functions marked by FunctionSpecializer, avoiding removing
+    // blocks in dead functions. Set MadeChanges if there is any dead function
+    // that will be removed later.
+    if (IsFuncSpecEnabled && Specializer.isDeadFunction(&F)) {
+      MadeChanges = true;
+      continue;
+    }
 
     SmallVector<BasicBlock *, 512> BlocksToErase;
 
@@ -326,12 +333,15 @@ static bool runIPSCCP(
     LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
                       << "' is constant!\n");
     for (User *U : make_early_inc_range(GV->users())) {
-      // We can remove LoadInst here, because we already replaced its users
-      // with a constant.
+      // We can remove LoadInst here. The LoadInsts in dead functions marked by
+      // FuncSpec are not simplified to constants, thus poison them.
       assert((isa<StoreInst>(U) || isa<LoadInst>(U)) &&
              "Only Store|Load Instruction can be user of GlobalVariable at "
              "reaching here.");
-      cast<Instruction>(U)->eraseFromParent();
+      Instruction *I = cast<Instruction>(U);
+      if (isa<LoadInst>(I))
+        I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+      I->eraseFromParent();
     }
 
     // Try to create a debug constant expression for the global variable
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 838f97c8f49a..2340fe556538 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -269,6 +269,12 @@ static bool enableUnifiedLTO(Module &M) {
 }
 #endif
 
+bool mustEmitToMergedModule(const GlobalValue *GV) {
+  // Only __cfi_check must live in the merged module: its definition is filled
+  // in by the CrossDSOCFI pass, which runs only on the merged module.
+  return GV->getName() == "__cfi_check";
+}
+
 // If it's possible to split M into regular and thin LTO parts, do so and write
 // a multi-module bitcode file with the two parts to OS. Otherwise, write only a
 // regular LTO bitcode file to OS.
@@ -350,19 +356,13 @@ void splitAndWriteThinLTOBitcode(
       });
     }
 
-  auto MustEmitToMergedModule = [](const GlobalValue *GV) {
-    // The __cfi_check definition is filled in by the CrossDSOCFI pass which
-    // runs only in the merged module.
-    return GV->getName() == "__cfi_check";
-  };
-
   ValueToValueMapTy VMap;
   std::unique_ptr<Module> MergedM(
       CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
         if (const auto *C = GV->getComdat())
           if (MergedMComdats.count(C))
             return true;
-        if (MustEmitToMergedModule(GV))
+        if (mustEmitToMergedModule(GV))
           return true;
         if (auto *F = dyn_cast<Function>(GV))
           return EligibleVirtualFns.count(F);
@@ -380,7 +380,7 @@ void splitAndWriteThinLTOBitcode(
   cloneUsedGlobalVariables(M, *MergedM, /*CompilerUsed*/ true);
 
   for (Function &F : *MergedM)
-    if (!F.isDeclaration() && !MustEmitToMergedModule(&F)) {
+    if (!F.isDeclaration() && !mustEmitToMergedModule(&F)) {
       // Reset the linkage of all functions eligible for virtual constant
       // propagation. The canonical definitions live in the thin LTO module so
       // that they can be imported.
@@ -406,7 +406,7 @@ void splitAndWriteThinLTOBitcode(
     if (const auto *C = GV->getComdat())
       if (MergedMComdats.count(C))
         return false;
-    if (MustEmitToMergedModule(GV))
+    if (mustEmitToMergedModule(GV))
       return false;
     return true;
   });
@@ -529,11 +529,13 @@ bool enableSplitLTOUnit(Module &M) {
   return EnableSplitLTOUnit;
 }
 
-// Returns whether this module needs to be split because it uses type metadata.
-bool hasTypeMetadata(Module &M) {
+// Returns whether this module needs to be split (if splitting is enabled).
+bool requiresSplit(Module &M) {
   for (auto &GO : M.global_objects()) {
     if (GO.hasMetadata(LLVMContext::MD_type))
       return true;
+    if (mustEmitToMergedModule(&GO))
+      return true;
   }
   return false;
 }
@@ -543,9 +545,9 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
                          Module &M, const ModuleSummaryIndex *Index,
                          const bool ShouldPreserveUseListOrder) {
   std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
-  // See if this module has any type metadata. If so, we try to split it
+  // See if this module needs to be split. If so, we try to split it
   // or at least promote type ids to enable WPD.
-  if (hasTypeMetadata(M)) {
+  if (requiresSplit(M)) {
     if (enableSplitLTOUnit(M)) {
       splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M,
                                   ShouldPreserveUseListOrder);
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index aec484f8a18f..bfb25c806e53 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -60,6 +60,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeReader.h"
@@ -68,6 +69,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
@@ -82,12 +84,15 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/GlobPattern.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
@@ -95,6 +100,7 @@
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/Transforms/Utils/Evaluator.h"
 #include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <map>
 #include <set>
@@ -167,6 +173,8 @@ static cl::list<std::string>
                       cl::desc("Prevent function(s) from being devirtualized"),
                       cl::Hidden, cl::CommaSeparated);
 
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
 /// With Clang, a pure virtual class's deleting destructor is emitted as a
 /// `llvm.trap` intrinsic followed by an unreachable IR instruction. In the
 /// context of whole program devirtualization, the deleting destructor of a pure
@@ -451,21 +459,21 @@ struct VirtualCallSite {
 
   void
   emitRemark(const StringRef OptName, const StringRef TargetName,
-             function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
+             function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter) {
     Function *F = CB.getCaller();
     DebugLoc DLoc = CB.getDebugLoc();
     BasicBlock *Block = CB.getParent();
 
     using namespace ore;
-    OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
-                      << NV("Optimization", OptName)
-                      << ": devirtualized a call to "
-                      << NV("FunctionName", TargetName));
+    OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+                       << NV("Optimization", OptName)
+                       << ": devirtualized a call to "
+                       << NV("FunctionName", TargetName));
   }
 
   void replaceAndErase(
       const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
-      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+      function_ref<OptimizationRemarkEmitter &(Function &)> OREGetter,
       Value *New) {
     if (RemarksEnabled)
       emitRemark(OptName, TargetName, OREGetter);
@@ -570,25 +578,24 @@ void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
 
 struct DevirtModule {
   Module &M;
-  function_ref<AAResults &(Function &)> AARGetter;
-  function_ref<DominatorTree &(Function &)> LookupDomTree;
+  ModuleAnalysisManager &MAM;
+  FunctionAnalysisManager &FAM;
 
-  ModuleSummaryIndex *ExportSummary;
-  const ModuleSummaryIndex *ImportSummary;
+  ModuleSummaryIndex *const ExportSummary;
+  const ModuleSummaryIndex *const ImportSummary;
 
-  IntegerType *Int8Ty;
-  PointerType *Int8PtrTy;
-  IntegerType *Int32Ty;
-  IntegerType *Int64Ty;
-  IntegerType *IntPtrTy;
+  IntegerType *const Int8Ty;
+  PointerType *const Int8PtrTy;
+  IntegerType *const Int32Ty;
+  IntegerType *const Int64Ty;
+  IntegerType *const IntPtrTy;
   /// Sizeless array type, used for imported vtables. This provides a signal
   /// to analyzers that these imports may alias, as they do for example
   /// when multiple unique return values occur in the same vtable.
-  ArrayType *Int8Arr0Ty;
-
-  bool RemarksEnabled;
-  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
+  ArrayType *const Int8Arr0Ty;
 
+  const bool RemarksEnabled;
+  std::function<OptimizationRemarkEmitter &(Function &)> OREGetter;
   MapVector<VTableSlot, VTableSlotInfo> CallSlots;
 
   // Calls that have already been optimized. We may add a call to multiple
@@ -611,12 +618,11 @@ struct DevirtModule {
   std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
   PatternList FunctionsToSkip;
 
-  DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
-               function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
-               function_ref<DominatorTree &(Function &)> LookupDomTree,
+  DevirtModule(Module &M, ModuleAnalysisManager &MAM,
                ModuleSummaryIndex *ExportSummary,
                const ModuleSummaryIndex *ImportSummary)
-      : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
+      : M(M), MAM(MAM),
+        FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
         ExportSummary(ExportSummary), ImportSummary(ImportSummary),
         Int8Ty(Type::getInt8Ty(M.getContext())),
         Int8PtrTy(PointerType::getUnqual(M.getContext())),
@@ -624,7 +630,10 @@ struct DevirtModule {
         Int64Ty(Type::getInt64Ty(M.getContext())),
         IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
         Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
-        RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
+        RemarksEnabled(areRemarksEnabled()),
+        OREGetter([&](Function &F) -> OptimizationRemarkEmitter & {
+          return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+        }) {
     assert(!(ExportSummary && ImportSummary));
     FunctionsToSkip.init(SkipFunctionNames);
   }
@@ -653,7 +662,7 @@ struct DevirtModule {
                            VTableSlotInfo &SlotInfo,
                            WholeProgramDevirtResolution *Res);
 
-  void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+  void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Function &JT,
                               bool &IsExported);
   void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
                             VTableSlotInfo &SlotInfo,
@@ -738,10 +747,7 @@ struct DevirtModule {
 
   // Lower the module using the action and summary passed as command line
   // arguments. For testing purposes only.
-  static bool
-  runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
-                function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
-                function_ref<DominatorTree &(Function &)> LookupDomTree);
+  static bool runForTesting(Module &M, ModuleAnalysisManager &MAM);
 };
 
 struct DevirtIndex {
@@ -782,25 +788,13 @@ struct DevirtIndex {
 } // end anonymous namespace
 
 PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
-                                              ModuleAnalysisManager &AM) {
-  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  auto AARGetter = [&](Function &F) -> AAResults & {
-    return FAM.getResult<AAManager>(F);
-  };
-  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
-    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
-  };
-  auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
-    return FAM.getResult<DominatorTreeAnalysis>(F);
-  };
+                                              ModuleAnalysisManager &MAM) {
   if (UseCommandLine) {
-    if (!DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree))
+    if (!DevirtModule::runForTesting(M, MAM))
       return PreservedAnalyses::all();
     return PreservedAnalyses::none();
   }
-  if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
-                    ImportSummary)
-           .run())
+  if (!DevirtModule(M, MAM, ExportSummary, ImportSummary).run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
@@ -832,8 +826,8 @@ typeIDVisibleToRegularObj(StringRef TypeID,
   // function for the base type and thus only contains a reference to the
   // type info (_ZTI). To catch this case we query using the type info
   // symbol corresponding to the TypeID.
-  std::string typeInfo = ("_ZTI" + TypeID).str();
-  return IsVisibleToRegularObj(typeInfo);
+  std::string TypeInfo = ("_ZTI" + TypeID).str();
+  return IsVisibleToRegularObj(TypeInfo);
 }
 
 static bool
@@ -842,7 +836,7 @@ skipUpdateDueToValidation(GlobalVariable &GV,
   SmallVector<MDNode *, 2> Types;
   GV.getMetadata(LLVMContext::MD_type, Types);
 
-  for (auto Type : Types)
+  for (auto *Type : Types)
     if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get()))
       return typeIDVisibleToRegularObj(TypeID->getString(),
                                        IsVisibleToRegularObj);
@@ -881,6 +875,7 @@ void llvm::updateVCallVisibilityInModule(
 
 void llvm::updatePublicTypeTestCalls(Module &M,
                                      bool WholeProgramVisibilityEnabledInLTO) {
+  llvm::TimeTraceScope timeScope("Update public type test calls");
   Function *PublicTypeTestFunc =
       Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
   if (!PublicTypeTestFunc)
@@ -912,9 +907,9 @@ void llvm::getVisibleToRegularObjVtableGUIDs(
     ModuleSummaryIndex &Index,
     DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
     function_ref<bool(StringRef)> IsVisibleToRegularObj) {
-  for (const auto &typeID : Index.typeIdCompatibleVtableMap()) {
-    if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj))
-      for (const TypeIdOffsetVtableInfo &P : typeID.second)
+  for (const auto &TypeID : Index.typeIdCompatibleVtableMap()) {
+    if (typeIDVisibleToRegularObj(TypeID.first, IsVisibleToRegularObj))
+      for (const TypeIdOffsetVtableInfo &P : TypeID.second)
         VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID());
   }
 }
@@ -957,7 +952,7 @@ void llvm::runWholeProgramDevirtOnIndex(
 
 void llvm::updateIndexWPDForExports(
     ModuleSummaryIndex &Summary,
-    function_ref<bool(StringRef, ValueInfo)> isExported,
+    function_ref<bool(StringRef, ValueInfo)> IsExported,
     std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
   for (auto &T : LocalWPDTargetsMap) {
     auto &VI = T.first;
@@ -965,7 +960,7 @@ void llvm::updateIndexWPDForExports(
     assert(VI.getSummaryList().size() == 1 &&
            "Devirt of local target has more than one copy");
     auto &S = VI.getSummaryList()[0];
-    if (!isExported(S->modulePath(), VI))
+    if (!IsExported(S->modulePath(), VI))
       continue;
 
     // It's been exported by a cross module import.
@@ -995,10 +990,7 @@ static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
   return ErrorSuccess();
 }
 
-bool DevirtModule::runForTesting(
-    Module &M, function_ref<AAResults &(Function &)> AARGetter,
-    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
-    function_ref<DominatorTree &(Function &)> LookupDomTree) {
+bool DevirtModule::runForTesting(Module &M, ModuleAnalysisManager &MAM) {
   std::unique_ptr<ModuleSummaryIndex> Summary =
       std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
 
@@ -1023,7 +1015,7 @@ bool DevirtModule::runForTesting(
   }
 
   bool Changed =
-      DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
+      DevirtModule(M, MAM,
                    ClSummaryAction == PassSummaryAction::Export ? Summary.get()
                                                                 : nullptr,
                    ClSummaryAction == PassSummaryAction::Import ? Summary.get()
@@ -1071,7 +1063,7 @@ void DevirtModule::buildTypeIdentifierMap(
     }
 
     for (MDNode *Type : Types) {
-      auto TypeID = Type->getOperand(1).get();
+      auto *TypeID = Type->getOperand(1).get();
 
       uint64_t Offset =
           cast<ConstantInt>(
@@ -1120,7 +1112,7 @@ bool DevirtModule::tryFindVirtualCallTargets(
 
     // Save the symbol used in the vtable to use as the devirtualization
     // target.
-    auto GV = dyn_cast<GlobalValue>(C);
+    auto *GV = dyn_cast<GlobalValue>(C);
     assert(GV);
     TargetsForSlot.push_back({GV, &TM});
   }
@@ -1284,7 +1276,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
     Apply(P.second);
 }
 
-static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
+static bool addCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
   // We can't add calls if we haven't seen a definition
   if (Callee.getSummaryList().empty())
     return false;
@@ -1359,7 +1351,7 @@ bool DevirtModule::trySingleImplDevirt(
   if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
     // Any needed promotion of 'TheFn' has already been done during
     // LTO unit split, so we can ignore return value of AddCalls.
-    AddCalls(SlotInfo, TheFnVI);
+    addCalls(SlotInfo, TheFnVI);
 
   Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
   Res->SingleImplName = std::string(TheFn->getName());
@@ -1400,7 +1392,7 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
     DevirtTargets.insert(TheFn);
 
   auto &S = TheFn.getSummaryList()[0];
-  bool IsExported = AddCalls(SlotInfo, TheFn);
+  bool IsExported = addCalls(SlotInfo, TheFn);
   if (IsExported)
     ExportedGUIDs.insert(TheFn.getGUID());
 
@@ -1497,13 +1489,19 @@ void DevirtModule::tryICallBranchFunnel(
   ReturnInst::Create(M.getContext(), nullptr, BB);
 
   bool IsExported = false;
-  applyICallBranchFunnel(SlotInfo, JT, IsExported);
+  applyICallBranchFunnel(SlotInfo, *JT, IsExported);
   if (IsExported)
     Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+
+  if (!JT->getEntryCount().has_value()) {
+    // FIXME: The necessary information could be passed through ThinLTO.
+    setExplicitlyUnknownFunctionEntryCount(*JT);
+  }
 }
 
 void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
-                                          Constant *JT, bool &IsExported) {
+                                          Function &JT, bool &IsExported) {
+  DenseMap<Function *, double> FunctionEntryCounts;
   auto Apply = [&](CallSiteInfo &CSInfo) {
     if (CSInfo.isExported())
       IsExported = true;
@@ -1531,8 +1529,7 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
 
       NumBranchFunnel++;
       if (RemarksEnabled)
-        VCallSite.emitRemark("branch-funnel",
-                             JT->stripPointerCasts()->getName(), OREGetter);
+        VCallSite.emitRemark("branch-funnel", JT.getName(), OREGetter);
 
       // Pass the address of the vtable in the nest register, which is r10 on
       // x86_64.
@@ -1548,11 +1545,28 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
       llvm::append_range(Args, CB.args());
 
       CallBase *NewCS = nullptr;
+      if (!JT.isDeclaration() && !ProfcheckDisableMetadataFixes) {
+        // Accumulate the call frequencies of the original call site, and use
+        // that as total entry count for the funnel function.
+        auto &F = *CB.getCaller();
+        auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+        auto EC = BFI.getBlockFreq(&F.getEntryBlock());
+        auto CC = F.getEntryCount(/*AllowSynthetic=*/true);
+        double CallCount = 0.0;
+        if (EC.getFrequency() != 0 && CC && CC->getCount() != 0) {
+          double CallFreq =
+              static_cast<double>(
+                  BFI.getBlockFreq(CB.getParent()).getFrequency()) /
+              EC.getFrequency();
+          CallCount = CallFreq * CC->getCount();
+        }
+        FunctionEntryCounts[&JT] += CallCount;
+      }
       if (isa<CallInst>(CB))
-        NewCS = IRB.CreateCall(NewFT, JT, Args);
+        NewCS = IRB.CreateCall(NewFT, &JT, Args);
       else
         NewCS =
-            IRB.CreateInvoke(NewFT, JT, cast<InvokeInst>(CB).getNormalDest(),
+            IRB.CreateInvoke(NewFT, &JT, cast<InvokeInst>(CB).getNormalDest(),
                              cast<InvokeInst>(CB).getUnwindDest(), Args);
       NewCS->setCallingConv(CB.getCallingConv());
 
@@ -1586,6 +1600,11 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
   Apply(SlotInfo.CSInfo);
   for (auto &P : SlotInfo.ConstCSInfo)
     Apply(P.second);
+  for (auto &[F, C] : FunctionEntryCounts) {
+    assert(!F->getEntryCount(/*AllowSynthetic=*/true) &&
+           "Unexpected entry count for funnel that was freshly synthesized");
+    F->setEntryCount(static_cast<uint64_t>(std::round(C)));
+  }
 }
 
 bool DevirtModule::tryEvaluateFunctionsWithArgs(
@@ -1597,7 +1616,7 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
     // TODO: Skip for now if the vtable symbol was an alias to a function,
     // need to evaluate whether it would be correct to analyze the aliasee
     // function for this optimization.
-    auto Fn = dyn_cast<Function>(Target.Fn);
+    auto *Fn = dyn_cast<Function>(Target.Fn);
     if (!Fn)
       return false;
 
@@ -1836,11 +1855,11 @@ bool DevirtModule::tryVirtualConstProp(
   // TODO: Skip for now if the vtable symbol was an alias to a function,
   // need to evaluate whether it would be correct to analyze the aliasee
   // function for this optimization.
-  auto Fn = dyn_cast<Function>(TargetsForSlot[0].Fn);
+  auto *Fn = dyn_cast<Function>(TargetsForSlot[0].Fn);
   if (!Fn)
     return false;
   // This only works if the function returns an integer.
-  auto RetType = dyn_cast<IntegerType>(Fn->getReturnType());
+  auto *RetType = dyn_cast<IntegerType>(Fn->getReturnType());
   if (!RetType)
     return false;
   unsigned BitWidth = RetType->getBitWidth();
@@ -1871,12 +1890,12 @@ bool DevirtModule::tryVirtualConstProp(
     // TODO: Skip for now if the vtable symbol was an alias to a function,
     // need to evaluate whether it would be correct to analyze the aliasee
     // function for this optimization.
-    auto Fn = dyn_cast<Function>(Target.Fn);
+    auto *Fn = dyn_cast<Function>(Target.Fn);
     if (!Fn)
       return false;
 
     if (Fn->isDeclaration() ||
-        !computeFunctionBodyMemoryAccess(*Fn, AARGetter(*Fn))
+        !computeFunctionBodyMemoryAccess(*Fn, FAM.getResult<AAManager>(*Fn))
              .doesNotAccessMemory() ||
         Fn->arg_empty() || !Fn->arg_begin()->use_empty() ||
         Fn->getReturnType() != RetType)
@@ -1992,11 +2011,11 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
 
   // Build an anonymous global containing the before bytes, followed by the
   // original initializer, followed by the after bytes.
-  auto NewInit = ConstantStruct::getAnon(
+  auto *NewInit = ConstantStruct::getAnon(
       {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
        B.GV->getInitializer(),
        ConstantDataArray::get(M.getContext(), B.After.Bytes)});
-  auto NewGV =
+  auto *NewGV =
       new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
                          GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
   NewGV->setSection(B.GV->getSection());
@@ -2009,7 +2028,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
 
   // Build an alias named after the original global, pointing at the second
   // element (the original initializer).
-  auto Alias = GlobalAlias::create(
+  auto *Alias = GlobalAlias::create(
       B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
       ConstantExpr::getInBoundsGetElementPtr(
           NewInit->getType(), NewGV,
@@ -2050,7 +2069,7 @@ void DevirtModule::scanTypeTestUsers(
     // Search for virtual calls based on %p and add them to DevirtCalls.
     SmallVector<DevirtCallSite, 1> DevirtCalls;
     SmallVector<CallInst *, 1> Assumes;
-    auto &DT = LookupDomTree(*CI->getFunction());
+    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction());
     findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
 
     Metadata *TypeId =
@@ -2127,7 +2146,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
     SmallVector<Instruction *, 1> LoadedPtrs;
     SmallVector<Instruction *, 1> Preds;
     bool HasNonCallUses = false;
-    auto &DT = LookupDomTree(*CI->getFunction());
+    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*CI->getFunction());
     findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
                                                HasNonCallUses, CI, DT);
 
@@ -2259,18 +2278,18 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
   if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
     // The type of the function is irrelevant, because it's bitcast at calls
     // anyhow.
-    Constant *JT = cast<Constant>(
+    auto *JT = cast<Function>(
         M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
                               Type::getVoidTy(M.getContext()))
             .getCallee());
     bool IsExported = false;
-    applyICallBranchFunnel(SlotInfo, JT, IsExported);
+    applyICallBranchFunnel(SlotInfo, *JT, IsExported);
     assert(!IsExported);
   }
 }
 
 void DevirtModule::removeRedundantTypeTests() {
-  auto True = ConstantInt::getTrue(M.getContext());
+  auto *True = ConstantInt::getTrue(M.getContext());
   for (auto &&U : NumUnsafeUsesForTypeTest) {
     if (U.second == 0) {
       U.first->replaceAllUsesWith(True);
@@ -2490,18 +2509,17 @@ bool DevirtModule::run() {
     // Generate remarks for each devirtualized function.
     for (const auto &DT : DevirtTargets) {
       GlobalValue *GV = DT.second;
-      auto F = dyn_cast<Function>(GV);
+      auto *F = dyn_cast<Function>(GV);
       if (!F) {
-        auto A = dyn_cast<GlobalAlias>(GV);
+        auto *A = dyn_cast<GlobalAlias>(GV);
         assert(A && isa<Function>(A->getAliasee()));
         F = dyn_cast<Function>(A->getAliasee());
         assert(F);
       }
 
       using namespace ore;
-      OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
-                        << "devirtualized "
-                        << NV("FunctionName", DT.first));
+      OREGetter(*F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+                         << "devirtualized " << NV("FunctionName", DT.first));
     }
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d934638c15e7..f9155cc66031 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2115,6 +2115,7 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
   }
 
   // Find common base and collect RHS GEPs.
+  bool First = true;
   while (true) {
     if (Ptrs.contains(RHS)) {
       Base.Ptr = RHS;
@@ -2123,7 +2124,12 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
 
     if (auto *GEP = dyn_cast<GEPOperator>(RHS)) {
       Base.RHSGEPs.push_back(GEP);
-      Base.RHSNW &= GEP->getNoWrapFlags();
+      if (First) {
+        First = false;
+        Base.RHSNW = GEP->getNoWrapFlags();
+      } else {
+        Base.RHSNW = Base.RHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags());
+      }
       RHS = GEP->getPointerOperand();
     } else {
       // No common base.
@@ -2132,13 +2138,19 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
   }
 
   // Collect LHS GEPs.
+  First = true;
   while (true) {
     if (LHS == Base.Ptr)
       break;
 
     auto *GEP = cast<GEPOperator>(LHS);
     Base.LHSGEPs.push_back(GEP);
-    Base.LHSNW &= GEP->getNoWrapFlags();
+    if (First) {
+      First = false;
+      Base.LHSNW = GEP->getNoWrapFlags();
+    } else {
+      Base.LHSNW = Base.LHSNW.intersectForOffsetAdd(GEP->getNoWrapFlags());
+    }
     LHS = GEP->getPointerOperand();
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a13d3ceb6132..2d7524e8018b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1799,16 +1799,21 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
   // type may provide more information to later folds, and the smaller logic
   // instruction may be cheaper (particularly in the case of vectors).
   Value *X;
+  auto &DL = IC.getDataLayout();
   if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
-    if (Constant *TruncC = IC.getLosslessUnsignedTrunc(C, SrcTy)) {
+    PreservedCastFlags Flags;
+    if (Constant *TruncC = getLosslessUnsignedTrunc(C, SrcTy, DL, &Flags)) {
       // LogicOpc (zext X), C --> zext (LogicOpc X, C)
       Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC);
-      return new ZExtInst(NewOp, DestTy);
+      auto *ZExt = new ZExtInst(NewOp, DestTy);
+      ZExt->setNonNeg(Flags.NNeg);
+      ZExt->andIRFlags(Cast);
+      return ZExt;
     }
   }
 
   if (match(Cast, m_OneUse(m_SExtLike(m_Value(X))))) {
-    if (Constant *TruncC = IC.getLosslessSignedTrunc(C, SrcTy)) {
+    if (Constant *TruncC = getLosslessSignedTrunc(C, SrcTy, DL)) {
       // LogicOpc (sext X), C --> sext (LogicOpc X, C)
       Value *NewOp = IC.Builder.CreateBinOp(LogicOpc, X, TruncC);
       return new SExtInst(NewOp, DestTy);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 42b65dde6725..33b66aeaffe6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1956,7 +1956,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Constant *C;
     if (match(I0, m_ZExt(m_Value(X))) && match(I1, m_Constant(C)) &&
         I0->hasOneUse()) {
-      if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType())) {
+      if (Constant *NarrowC = getLosslessUnsignedTrunc(C, X->getType(), DL)) {
         Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC);
         return CastInst::Create(Instruction::ZExt, NarrowMaxMin, II->getType());
       }
@@ -2006,7 +2006,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Constant *C;
     if (match(I0, m_SExt(m_Value(X))) && match(I1, m_Constant(C)) &&
         I0->hasOneUse()) {
-      if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType())) {
+      if (Constant *NarrowC = getLosslessSignedTrunc(C, X->getType(), DL)) {
         Value *NarrowMaxMin = Builder.CreateBinaryIntrinsic(IID, X, NarrowC);
         return CastInst::Create(Instruction::SExt, NarrowMaxMin, II->getType());
       }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index fdef49e310f8..ccf918f0b6db 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -11,11 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
@@ -969,6 +971,25 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
     Changed = true;
   }
 
+  const APInt *C1;
+  Value *V1;
+  // OP = { lshr, ashr }
+  // trunc (OP i8 C1, V1) to i1 -> icmp eq V1, log_2(C1) iff C1 is a power of 2
+  if (DestWidth == 1 && match(Src, m_Shr(m_Power2(C1), m_Value(V1)))) {
+    Value *Right = ConstantInt::get(V1->getType(), C1->countr_zero());
+    Value *Icmp = Builder.CreateICmpEQ(V1, Right);
+    return replaceInstUsesWith(Trunc, Icmp);
+  }
+
+  // OP = { lshr, ashr }
+  // trunc (OP i8 C1, V1) to i1 -> icmp ult V1, log_2(C1 + 1) iff (C1 + 1) is
+  // a power of 2
+  if (DestWidth == 1 && match(Src, m_Shr(m_LowBitMask(C1), m_Value(V1)))) {
+    Value *Right = ConstantInt::get(V1->getType(), C1->countr_one());
+    Value *Icmp = Builder.CreateICmpULT(V1, Right);
+    return replaceInstUsesWith(Trunc, Icmp);
+  }
+
   return Changed ? &Trunc : nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3a8e04303815..99ea04816681 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/CmpInstAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/Utils/Local.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/ConstantRange.h"
@@ -110,75 +111,41 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
 /// If AndCst is non-null, then the loaded value is masked with that constant
 /// before doing the comparison. This handles cases like "A[i]&4 == 0".
 Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
-    LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI,
-    ConstantInt *AndCst) {
-  if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() ||
-      !GV->getValueType()->isArrayTy() || !GV->isConstant() ||
+    LoadInst *LI, GetElementPtrInst *GEP, CmpInst &ICI, ConstantInt *AndCst) {
+  auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(GEP));
+  if (LI->isVolatile() || !GV || !GV->isConstant() ||
       !GV->hasDefinitiveInitializer())
     return nullptr;
 
-  Type *GEPSrcEltTy = GEP->getSourceElementType();
-  if (GEPSrcEltTy->isArrayTy())
-    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
-  if (GV->getValueType()->getArrayElementType() != GEPSrcEltTy)
+  Type *EltTy = LI->getType();
+  TypeSize EltSize = DL.getTypeStoreSize(EltTy);
+  if (EltSize.isScalable())
     return nullptr;
 
-  Constant *Init = GV->getInitializer();
-  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+  LinearExpression Expr = decomposeLinearExpression(DL, GEP);
+  if (!Expr.Index || Expr.BasePtr != GV || Expr.Offset.getBitWidth() > 64)
     return nullptr;
 
-  uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
-  // Don't blow up on huge arrays.
-  if (ArrayElementCount > MaxArraySizeForCombine)
-    return nullptr;
+  Constant *Init = GV->getInitializer();
+  TypeSize GlobalSize = DL.getTypeAllocSize(Init->getType());
 
-  // There are many forms of this optimization we can handle, for now, just do
-  // the simple index into a single-dimensional array or elements of equal size.
-  //
-  // Require: GEP [n x i8] GV, 0, Idx {{, constant indices}}
-  //      Or: GEP i8 GV, Idx
+  Value *Idx = Expr.Index;
+  const APInt &Stride = Expr.Scale;
+  const APInt &ConstOffset = Expr.Offset;
 
-  unsigned GEPIdxOp = 1;
-  if (GEP->getSourceElementType()->isArrayTy()) {
-    GEPIdxOp = 2;
-    if (!match(GEP->getOperand(1), m_ZeroInt()))
-      return nullptr;
-  }
-  if (GEP->getNumOperands() < GEPIdxOp + 1 ||
-      isa<Constant>(GEP->getOperand(GEPIdxOp)))
+  // Allow an additional constant offset, but only within the stride.
+  if (!ConstOffset.ult(Stride))
     return nullptr;
 
-  // Check that indices after the variable are constants and in-range for the
-  // type they index.  Collect the indices.  This is typically for arrays of
-  // structs.
-  SmallVector<unsigned, 4> LaterIndices;
-
-  Type *EltTy = Init->getType()->getArrayElementType();
-  for (unsigned i = GEPIdxOp + 1, e = GEP->getNumOperands(); i != e; ++i) {
-    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
-    if (!Idx)
-      return nullptr; // Variable index.
-
-    uint64_t IdxVal = Idx->getZExtValue();
-    if ((unsigned)IdxVal != IdxVal)
-      return nullptr; // Too large array index.
-
-    if (StructType *STy = dyn_cast<StructType>(EltTy))
-      EltTy = STy->getElementType(IdxVal);
-    else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
-      if (IdxVal >= ATy->getNumElements())
-        return nullptr;
-      EltTy = ATy->getElementType();
-    } else {
-      return nullptr; // Unknown type.
-    }
-
-    LaterIndices.push_back(IdxVal);
-  }
+  // Don't handle overlapping loads for now.
+  if (!Stride.uge(EltSize.getFixedValue()))
+    return nullptr;
 
-  Value *Idx = GEP->getOperand(GEPIdxOp);
-  // If the index type is non-canonical, wait for it to be canonicalized.
-  if (Idx->getType() != DL.getIndexType(GEP->getType()))
+  // Don't blow up on huge arrays.
+  uint64_t ArrayElementCount =
+      divideCeil((GlobalSize.getFixedValue() - ConstOffset.getZExtValue()),
+                 Stride.getZExtValue());
+  if (ArrayElementCount > MaxArraySizeForCombine)
     return nullptr;
 
   enum { Overdefined = -3, Undefined = -2 };
@@ -211,18 +178,12 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
 
   // Scan the array and see if one of our patterns matches.
   Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
-  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
-    Constant *Elt = Init->getAggregateElement(i);
+  APInt Offset = ConstOffset;
+  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i, Offset += Stride) {
+    Constant *Elt = ConstantFoldLoadFromConst(Init, EltTy, Offset, DL);
     if (!Elt)
       return nullptr;
 
-    // If this is indexing an array of structures, get the structure element.
-    if (!LaterIndices.empty()) {
-      Elt = ConstantFoldExtractValueInstruction(Elt, LaterIndices);
-      if (!Elt)
-        return nullptr;
-    }
-
     // If the element is masked, handle it.
     if (AndCst) {
       Elt = ConstantFoldBinaryOpOperands(Instruction::And, Elt, AndCst, DL);
@@ -309,19 +270,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
   // Now that we've scanned the entire array, emit our new comparison(s).  We
   // order the state machines in complexity of the generated code.
 
-  // If inbounds keyword is not present, Idx * ElementSize can overflow.
-  // Let's assume that ElementSize is 2 and the wanted value is at offset 0.
+  // If inbounds keyword is not present, Idx * Stride can overflow.
+  // Let's assume that Stride is 2 and the wanted value is at offset 0.
   // Then, there are two possible values for Idx to match offset 0:
   // 0x00..00, 0x80..00.
   // Emitting 'icmp eq Idx, 0' isn't correct in this case because the
   // comparison is false if Idx was 0x80..00.
   // We need to erase the highest countTrailingZeros(ElementSize) bits of Idx.
-  unsigned ElementSize =
-      DL.getTypeAllocSize(Init->getType()->getArrayElementType());
   auto MaskIdx = [&](Value *Idx) {
-    if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) {
+    if (!Expr.Flags.isInBounds() && Stride.countr_zero() != 0) {
       Value *Mask = Constant::getAllOnesValue(Idx->getType());
-      Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize));
+      Mask = Builder.CreateLShr(Mask, Stride.countr_zero());
       Idx = Builder.CreateAnd(Idx, Mask);
     }
     return Idx;
@@ -1997,10 +1956,8 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
   if (auto *C2 = dyn_cast<ConstantInt>(Y))
     if (auto *LI = dyn_cast<LoadInst>(X))
       if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
-        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-          if (Instruction *Res =
-                  foldCmpLoadFromIndexedGlobal(LI, GEP, GV, Cmp, C2))
-            return Res;
+        if (Instruction *Res = foldCmpLoadFromIndexedGlobal(LI, GEP, Cmp, C2))
+          return Res;
 
   if (!Cmp.isEquality())
     return nullptr;
@@ -4353,10 +4310,9 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
     // Try to optimize things like "A[i] > 4" to index computations.
     if (GetElementPtrInst *GEP =
             dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
-      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-        if (Instruction *Res =
-                foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, GV, I))
-          return Res;
+      if (Instruction *Res =
+              foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I))
+        return Res;
     break;
   }
 
@@ -6375,7 +6331,7 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) {
 
   // If a lossless truncate is possible...
   Type *SrcTy = CastOp0->getSrcTy();
-  Constant *Res = getLosslessTrunc(C, SrcTy, CastOp0->getOpcode());
+  Constant *Res = getLosslessInvCast(C, SrcTy, CastOp0->getOpcode(), DL);
   if (Res) {
     if (ICmp.isEquality())
       return new ICmpInst(ICmp.getPredicate(), X, Res);
@@ -8837,10 +8793,9 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
       break;
     case Instruction::Load:
       if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
-        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-          if (Instruction *Res = foldCmpLoadFromIndexedGlobal(
-                  cast<LoadInst>(LHSI), GEP, GV, I))
-            return Res;
+        if (Instruction *Res =
+                foldCmpLoadFromIndexedGlobal(cast<LoadInst>(LHSI), GEP, I))
+          return Res;
       break;
     case Instruction::FPTrunc:
       if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC))
@@ -8944,14 +8899,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
   }
 
   {
-    Value *CanonLHS = nullptr, *CanonRHS = nullptr;
+    Value *CanonLHS = nullptr;
     match(Op0, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonLHS)));
-    match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS)));
-
     // (canonicalize(x) == x) => (x == x)
     if (CanonLHS == Op1)
       return new FCmpInst(Pred, Op1, Op1, "", &I);
 
+    Value *CanonRHS = nullptr;
+    match(Op1, m_Intrinsic<Intrinsic::canonicalize>(m_Value(CanonRHS)));
     // (x == canonicalize(x)) => (x == x)
     if (CanonRHS == Op0)
       return new FCmpInst(Pred, Op0, Op0, "", &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 2340028ce93d..7a979c16da50 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -222,23 +222,6 @@ public:
   bool fmulByZeroIsZero(Value *MulVal, FastMathFlags FMF,
                         const Instruction *CtxI) const;
 
-  Constant *getLosslessTrunc(Constant *C, Type *TruncTy, unsigned ExtOp) {
-    Constant *TruncC = ConstantExpr::getTrunc(C, TruncTy);
-    Constant *ExtTruncC =
-        ConstantFoldCastOperand(ExtOp, TruncC, C->getType(), DL);
-    if (ExtTruncC && ExtTruncC == C)
-      return TruncC;
-    return nullptr;
-  }
-
-  Constant *getLosslessUnsignedTrunc(Constant *C, Type *TruncTy) {
-    return getLosslessTrunc(C, TruncTy, Instruction::ZExt);
-  }
-
-  Constant *getLosslessSignedTrunc(Constant *C, Type *TruncTy) {
-    return getLosslessTrunc(C, TruncTy, Instruction::SExt);
-  }
-
   std::optional<std::pair<Intrinsic::ID, SmallVector<Value *, 3>>>
   convertOrOfShiftsToFunnelShift(Instruction &Or);
 
@@ -710,7 +693,7 @@ public:
   bool foldAllocaCmp(AllocaInst *Alloca);
   Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI,
                                             GetElementPtrInst *GEP,
-                                            GlobalVariable *GV, CmpInst &ICI,
+                                            CmpInst &ICI,
                                             ConstantInt *AndCst = nullptr);
   Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
                                     Constant *RHSC);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index d7310b1c741c..a9aacc707cc2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1642,10 +1642,11 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
   }
 
   Constant *C;
+  auto &DL = IC.getDataLayout();
   if (isa<Instruction>(N) && match(N, m_OneUse(m_ZExt(m_Value(X)))) &&
       match(D, m_Constant(C))) {
     // If the constant is the same in the smaller type, use the narrow version.
-    Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType());
+    Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL);
     if (!TruncC)
       return nullptr;
 
@@ -1656,7 +1657,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
   if (isa<Instruction>(D) && match(D, m_OneUse(m_ZExt(m_Value(X)))) &&
       match(N, m_Constant(C))) {
     // If the constant is the same in the smaller type, use the narrow version.
-    Constant *TruncC = IC.getLosslessUnsignedTrunc(C, X->getType());
+    Constant *TruncC = getLosslessUnsignedTrunc(C, X->getType(), DL);
     if (!TruncC)
       return nullptr;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 6477141ab095..ed9a0be6981f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -841,7 +841,7 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
       NumZexts++;
     } else if (auto *C = dyn_cast<Constant>(V)) {
       // Make sure that constants can fit in the new type.
-      Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType);
+      Constant *Trunc = getLosslessUnsignedTrunc(C, NarrowType, DL);
       if (!Trunc)
         return nullptr;
       NewIncoming.push_back(Trunc);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index eb4332fbc095..9467463d39c0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1993,6 +1993,63 @@ Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp,
   return BinOp;
 }
 
+/// Folds:
+///   %a_sub = call @llvm.usub.sat(x, IntConst1)
+///   %b_sub = call @llvm.usub.sat(y, IntConst2)
+///   %or = or %a_sub, %b_sub
+///   %cmp = icmp eq %or, 0                     ; or: icmp ne %or, 0
+///   %sel = select %cmp, 0, MostSignificantBit ; or: select %cmp, MSB, 0
+/// into:
+///   %a_sub' = usub.sat(x, IntConst1 - MostSignificantBit + 1)
+///   %b_sub' = usub.sat(y, IntConst2 - MostSignificantBit + 1)
+///   %or = or %a_sub', %b_sub'
+///   %and = and %or, MostSignificantBit
+/// Likewise, for vector arguments as well. Returns nullptr if not applicable.
+static Instruction *foldICmpUSubSatWithAndForMostSignificantBitCmp(
+    SelectInst &SI, ICmpInst *ICI, InstCombiner::BuilderTy &Builder) {
+  if (!SI.hasOneUse() || !ICI->hasOneUse())
+    return nullptr;
+  CmpPredicate Pred;
+  Value *A, *B;
+  const APInt *Constant1, *Constant2;
+  if (!match(SI.getCondition(),
+             m_ICmp(Pred,
+                    m_OneUse(m_Or(m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(
+                                      m_Value(A), m_APInt(Constant1))),
+                                  m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(
+                                      m_Value(B), m_APInt(Constant2))))),
+                    m_Zero())))
+    return nullptr;
+
+  // Accept `eq ? 0 : MSB` and the mirrored `ne ? MSB : 0`; reject the rest.
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+  if (!((Pred == ICmpInst::ICMP_EQ && match(TrueVal, m_Zero()) &&
+         match(FalseVal, m_SignMask())) ||
+        (Pred == ICmpInst::ICMP_NE && match(TrueVal, m_SignMask()) &&
+         match(FalseVal, m_Zero()))))
+    return nullptr;
+
+  auto *Ty = A->getType();
+  APInt MostSignificantBit = APInt::getSignMask(Constant1->getBitWidth());
+
+  // Both constants must have the sign bit set, i.e. be >= MSB when unsigned.
+  if (Constant1->isNonNegative() || Constant2->isNonNegative())
+    return nullptr;
+
+  APInt AdjAP1 = *Constant1 - MostSignificantBit + 1;
+  APInt AdjAP2 = *Constant2 - MostSignificantBit + 1;
+
+  auto *Adj1 = ConstantInt::get(Ty, AdjAP1);
+  auto *Adj2 = ConstantInt::get(Ty, AdjAP2);
+
+  Value *NewA = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, Adj1);
+  Value *NewB = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, B, Adj2);
+  Value *Or = Builder.CreateOr(NewA, NewB);
+  Constant *MSBConst = ConstantInt::get(Ty, MostSignificantBit);
+  return BinaryOperator::CreateAnd(Or, MSBConst);
+}
+
 /// Visit a SelectInst that has an ICmpInst as its first operand.
 Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
                                                       ICmpInst *ICI) {
@@ -2009,6 +2066,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
   if (Instruction *NewSel =
           tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
     return NewSel;
+  if (Instruction *Folded =
+          foldICmpUSubSatWithAndForMostSignificantBitCmp(SI, ICI, Builder))
+    return Folded;
 
   // NOTE: if we wanted to, this is where to detect integer MIN/MAX
   bool Changed = false;
@@ -2315,7 +2375,7 @@ Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) {
   // If the constant is the same after truncation to the smaller type and
   // extension to the original type, we can narrow the select.
   Type *SelType = Sel.getType();
-  Constant *TruncC = getLosslessTrunc(C, SmallType, ExtOpcode);
+  Constant *TruncC = getLosslessInvCast(C, SmallType, ExtOpcode, DL);
   if (TruncC && ExtInst->hasOneUse()) {
     Value *TruncCVal = cast<Value>(TruncC);
     if (ExtInst == Sel.getFalseValue())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index f17fecd430a6..aa030294ff1e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -795,8 +795,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
         I->dropPoisonGeneratingFlags();
         return I;
       }
-      Known.Zero.lshrInPlace(ShiftAmt);
-      Known.One.lshrInPlace(ShiftAmt);
+      Known >>= ShiftAmt;
       if (ShiftAmt)
         Known.Zero.setHighBits(ShiftAmt);  // high bits known zero.
     } else {
@@ -1066,10 +1065,9 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
           }
         }
 
-        Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
-                     RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
-        Known.One = LHSKnown.One.shl(ShiftAmt) |
-                    RHSKnown.One.lshr(BitWidth - ShiftAmt);
+        LHSKnown <<= ShiftAmt;
+        RHSKnown >>= BitWidth - ShiftAmt;
+        Known = LHSKnown.unionWith(RHSKnown);
         KnownBitsComputed = true;
         break;
       }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 5ee3bb1abe86..c2f045a2ab02 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2027,9 +2027,7 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
   }
 
   if (OneUse) {
-    replaceAllDbgUsesWith(const_cast<PHINode &>(*PN),
-                          const_cast<PHINode &>(*NewPN),
-                          const_cast<PHINode &>(*PN), DT);
+    replaceAllDbgUsesWith(*PN, *NewPN, *PN, DT);
   }
   return replaceInstUsesWith(I, NewPN);
 }
@@ -2570,7 +2568,7 @@ Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) {
     Constant *WideC;
     if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
       return nullptr;
-    Constant *NarrowC = getLosslessTrunc(WideC, X->getType(), CastOpc);
+    Constant *NarrowC = getLosslessInvCast(WideC, X->getType(), CastOpc, DL);
     if (!NarrowC)
       return nullptr;
     Y = NarrowC;
@@ -2676,6 +2674,62 @@ static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP,
   return nullptr;
 }
 
+/// Combine constant offsets separated by variable offsets.
+/// ptradd (ptradd (ptradd p, C1), x), C2 -> ptradd (ptradd p, x), C1+C2
+static Instruction *combineConstantOffsets(GetElementPtrInst &GEP,
+                                           InstCombinerImpl &IC) {
+  if (!GEP.hasAllConstantIndices())
+    return nullptr;
+  // Walk the pointer chain past non-constant GEPs to a constant-index GEP.
+  GEPNoWrapFlags NW = GEPNoWrapFlags::all();
+  SmallVector<GetElementPtrInst *> Skipped;
+  auto *InnerGEP = dyn_cast<GetElementPtrInst>(GEP.getPointerOperand());
+  while (true) {
+    if (!InnerGEP)
+      return nullptr;
+
+    NW = NW.intersectForReassociate(InnerGEP->getNoWrapFlags());
+    if (InnerGEP->hasAllConstantIndices())
+      break;
+
+    if (!InnerGEP->hasOneUse())
+      return nullptr;
+
+    Skipped.push_back(InnerGEP);
+    InnerGEP = dyn_cast<GetElementPtrInst>(InnerGEP->getPointerOperand());
+  }
+
+  // The two constant offset GEPs are directly adjacent: Let normal offset
+  // merging handle it.
+  if (Skipped.empty())
+    return nullptr;
+
+  // FIXME: This one-use check is not strictly necessary. Consider relaxing it
+  // if profitable.
+  if (!InnerGEP->hasOneUse())
+    return nullptr;
+
+  // Don't bother with vector splats.
+  Type *Ty = GEP.getType();
+  if (InnerGEP->getType() != Ty)
+    return nullptr;
+  // Fold both constant GEPs into a single offset of the pointer's index width.
+  const DataLayout &DL = IC.getDataLayout();
+  APInt Offset(DL.getIndexTypeSizeInBits(Ty), 0);
+  if (!GEP.accumulateConstantOffset(DL, Offset) ||
+      !InnerGEP->accumulateConstantOffset(DL, Offset))
+    return nullptr;
+  // Bypass InnerGEP in the chain; every skipped GEP gets the intersected flags.
+  IC.replaceOperand(*Skipped.back(), 0, InnerGEP->getPointerOperand());
+  for (GetElementPtrInst *SkippedGEP : Skipped)
+    SkippedGEP->setNoWrapFlags(NW);
+  // Re-apply the summed constant offset on top of the outermost skipped GEP.
+  return IC.replaceInstUsesWith(
+      GEP,
+      IC.Builder.CreatePtrAdd(Skipped.front(), IC.Builder.getInt(Offset), "",
+                              NW.intersectForOffsetAdd(GEP.getNoWrapFlags())));
+}
+
 Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
                                              GEPOperator *Src) {
   // Combine Indices - If the source pointer to this getelementptr instruction
@@ -2687,125 +2741,56 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
   if (auto *I = canonicalizeGEPOfConstGEPI8(GEP, Src, *this))
     return I;
 
-  // For constant GEPs, use a more general offset-based folding approach.
-  Type *PtrTy = Src->getType()->getScalarType();
-  if (GEP.hasAllConstantIndices() &&
-      (Src->hasOneUse() || Src->hasAllConstantIndices())) {
-    // Split Src into a variable part and a constant suffix.
-    gep_type_iterator GTI = gep_type_begin(*Src);
-    Type *BaseType = GTI.getIndexedType();
-    bool IsFirstType = true;
-    unsigned NumVarIndices = 0;
-    for (auto Pair : enumerate(Src->indices())) {
-      if (!isa<ConstantInt>(Pair.value())) {
-        BaseType = GTI.getIndexedType();
-        IsFirstType = false;
-        NumVarIndices = Pair.index() + 1;
-      }
-      ++GTI;
-    }
-
-    // Determine the offset for the constant suffix of Src.
-    APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), 0);
-    if (NumVarIndices != Src->getNumIndices()) {
-      // FIXME: getIndexedOffsetInType() does not handled scalable vectors.
-      if (BaseType->isScalableTy())
-        return nullptr;
-
-      SmallVector<Value *> ConstantIndices;
-      if (!IsFirstType)
-        ConstantIndices.push_back(
-            Constant::getNullValue(Type::getInt32Ty(GEP.getContext())));
-      append_range(ConstantIndices, drop_begin(Src->indices(), NumVarIndices));
-      Offset += DL.getIndexedOffsetInType(BaseType, ConstantIndices);
-    }
-
-    // Add the offset for GEP (which is fully constant).
-    if (!GEP.accumulateConstantOffset(DL, Offset))
-      return nullptr;
-
-    // Convert the total offset back into indices.
-    SmallVector<APInt> ConstIndices =
-        DL.getGEPIndicesForOffset(BaseType, Offset);
-    if (!Offset.isZero() || (!IsFirstType && !ConstIndices[0].isZero()))
-      return nullptr;
-
-    GEPNoWrapFlags NW = getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP));
-    SmallVector<Value *> Indices(
-        drop_end(Src->indices(), Src->getNumIndices() - NumVarIndices));
-    for (const APInt &Idx : drop_begin(ConstIndices, !IsFirstType)) {
-      Indices.push_back(ConstantInt::get(GEP.getContext(), Idx));
-      // Even if the total offset is inbounds, we may end up representing it
-      // by first performing a larger negative offset, and then a smaller
-      // positive one. The large negative offset might go out of bounds. Only
-      // preserve inbounds if all signs are the same.
-      if (Idx.isNonNegative() != ConstIndices[0].isNonNegative())
-        NW = NW.withoutNoUnsignedSignedWrap();
-      if (!Idx.isNonNegative())
-        NW = NW.withoutNoUnsignedWrap();
-    }
-
-    return replaceInstUsesWith(
-        GEP, Builder.CreateGEP(Src->getSourceElementType(), Src->getOperand(0),
-                               Indices, "", NW));
-  }
+  if (auto *I = combineConstantOffsets(GEP, *this))
+    return I;
 
   if (Src->getResultElementType() != GEP.getSourceElementType())
     return nullptr;
 
-  SmallVector<Value*, 8> Indices;
-
   // Find out whether the last index in the source GEP is a sequential idx.
   bool EndsWithSequential = false;
   for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
        I != E; ++I)
     EndsWithSequential = I.isSequential();
+  if (!EndsWithSequential)
+    return nullptr;
 
-  // Can we combine the two pointer arithmetics offsets?
-  if (EndsWithSequential) {
-    // Replace: gep (gep %P, long B), long A, ...
-    // With:    T = long A+B; gep %P, T, ...
-    Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
-    Value *GO1 = GEP.getOperand(1);
-
-    // If they aren't the same type, then the input hasn't been processed
-    // by the loop above yet (which canonicalizes sequential index types to
-    // intptr_t).  Just avoid transforming this until the input has been
-    // normalized.
-    if (SO1->getType() != GO1->getType())
-      return nullptr;
+  // Replace: gep (gep %P, long B), long A, ...
+  // With:    T = long A+B; gep %P, T, ...
+  Value *SO1 = Src->getOperand(Src->getNumOperands() - 1);
+  Value *GO1 = GEP.getOperand(1);
 
-    Value *Sum =
-        simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
-    // Only do the combine when we are sure the cost after the
-    // merge is never more than that before the merge.
-    if (Sum == nullptr)
-      return nullptr;
+  // If they aren't the same type, then the input hasn't been processed
+  // by the loop above yet (which canonicalizes sequential index types to
+  // intptr_t).  Just avoid transforming this until the input has been
+  // normalized.
+  if (SO1->getType() != GO1->getType())
+    return nullptr;
 
-    Indices.append(Src->op_begin()+1, Src->op_end()-1);
-    Indices.push_back(Sum);
-    Indices.append(GEP.op_begin()+2, GEP.op_end());
-  } else if (isa<Constant>(*GEP.idx_begin()) &&
-             cast<Constant>(*GEP.idx_begin())->isNullValue() &&
-             Src->getNumOperands() != 1) {
-    // Otherwise we can do the fold if the first index of the GEP is a zero
-    Indices.append(Src->op_begin()+1, Src->op_end());
-    Indices.append(GEP.idx_begin()+1, GEP.idx_end());
-  }
-
-  // Don't create GEPs with more than one variable index.
-  unsigned NumVarIndices =
-      count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); });
-  if (NumVarIndices > 1)
+  Value *Sum =
+      simplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+  // Only do the combine when we are sure the cost after the
+  // merge is never more than that before the merge.
+  if (Sum == nullptr)
     return nullptr;
 
-  if (!Indices.empty())
-    return replaceInstUsesWith(
-        GEP, Builder.CreateGEP(
-                 Src->getSourceElementType(), Src->getOperand(0), Indices, "",
-                 getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
+  SmallVector<Value *, 8> Indices;
+  Indices.append(Src->op_begin() + 1, Src->op_end() - 1);
+  Indices.push_back(Sum);
+  Indices.append(GEP.op_begin() + 2, GEP.op_end());
 
-  return nullptr;
+  // Don't create GEPs with more than one non-zero index.
+  unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) {
+    auto *C = dyn_cast<Constant>(Idx);
+    return !C || !C->isNullValue();
+  });
+  if (NumNonZeroIndices > 1)
+    return nullptr;
+
+  return replaceInstUsesWith(
+      GEP, Builder.CreateGEP(
+               Src->getSourceElementType(), Src->getOperand(0), Indices, "",
+               getMergedGEPNoWrapFlags(*Src, *cast<GEPOperator>(&GEP))));
 }
 
 Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
@@ -3238,6 +3223,19 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
                                drop_end(Indices), "", GEP.getNoWrapFlags()));
   }
 
+  // Strip leading zero indices.
+  auto *FirstIdx = dyn_cast<Constant>(Indices.front());
+  if (FirstIdx && FirstIdx->isNullValue() &&
+      !FirstIdx->getType()->isVectorTy()) {
+    gep_type_iterator GTI = gep_type_begin(GEP);
+    ++GTI;
+    if (!GTI.isStruct())
+      return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(),
+                                                        GEP.getPointerOperand(),
+                                                        drop_begin(Indices), "",
+                                                        GEP.getNoWrapFlags()));
+  }
+
   // Scalarize vector operands; prefer splat-of-gep.as canonical form.
   // Note that this looses information about undef lanes; we run it after
   // demanded bits to partially mitigate that loss.
@@ -3264,17 +3262,18 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     return replaceInstUsesWith(GEP, Res);
   }
 
-  bool SeenVarIndex = false;
+  bool SeenNonZeroIndex = false;
   for (auto [IdxNum, Idx] : enumerate(Indices)) {
-    if (isa<Constant>(Idx))
+    auto *C = dyn_cast<Constant>(Idx);
+    if (C && C->isNullValue())
       continue;
 
-    if (!SeenVarIndex) {
-      SeenVarIndex = true;
+    if (!SeenNonZeroIndex) {
+      SeenNonZeroIndex = true;
       continue;
     }
 
-    // GEP has multiple variable indices: Split it.
+    // GEP has multiple non-zero indices: Split it.
     ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
     Value *FrontGEP =
         Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
@@ -4961,63 +4960,68 @@ Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) {
 Value *
 InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) {
   // Try to push freeze through instructions that propagate but don't produce
-  // poison as far as possible.  If an operand of freeze follows three
-  // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
-  // guaranteed-non-poison operands then push the freeze through to the one
-  // operand that is not guaranteed non-poison.  The actual transform is as
-  // follows.
-  //   Op1 = ...                        ; Op1 can be posion
-  //   Op0 = Inst(Op1, NonPoisonOps...) ; Op0 has only one use and only have
-  //                                    ; single guaranteed-non-poison operands
+  // poison as far as possible. If an operand of freeze does not produce poison
+  // then push the freeze through to the operands that are not guaranteed
+  // non-poison. The actual transform is as follows.
+  //   Op1 = ...                        ; Op1 can be poison
+  //   Op0 = Inst(Op1, NonPoisonOps...)
   //   ... = Freeze(Op0)
   // =>
   //   Op1 = ...
   //   Op1.fr = Freeze(Op1)
   //   ... = Inst(Op1.fr, NonPoisonOps...)
-  auto *OrigOp = OrigFI.getOperand(0);
-  auto *OrigOpInst = dyn_cast<Instruction>(OrigOp);
 
-  // While we could change the other users of OrigOp to use freeze(OrigOp), that
-  // potentially reduces their optimization potential, so let's only do this iff
-  // the OrigOp is only used by the freeze.
-  if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp))
-    return nullptr;
+  auto CanPushFreeze = [](Value *V) {
+    if (!isa<Instruction>(V) || isa<PHINode>(V))
+      return false;
 
-  // We can't push the freeze through an instruction which can itself create
-  // poison.  If the only source of new poison is flags, we can simply
-  // strip them (since we know the only use is the freeze and nothing can
-  // benefit from them.)
-  if (canCreateUndefOrPoison(cast<Operator>(OrigOp),
-                             /*ConsiderFlagsAndMetadata*/ false))
-    return nullptr;
+    // We can't push the freeze through an instruction which can itself create
+    // poison.  If the only source of new poison is flags, we can simply strip
+    // them. NOTE(review): no one-use check is visible in this function, so
+    // other users of V may also lose those flags -- confirm this is intended.
+    return !canCreateUndefOrPoison(cast<Operator>(V),
+                                   /*ConsiderFlagsAndMetadata*/ false);
+  };
 
-  // If operand is guaranteed not to be poison, there is no need to add freeze
-  // to the operand. So we first find the operand that is not guaranteed to be
-  // poison.
-  Value *MaybePoisonOperand = nullptr;
-  for (Value *V : OrigOpInst->operands()) {
-    if (isa<MetadataAsValue>(V) || isGuaranteedNotToBeUndefOrPoison(V) ||
-        // Treat identical operands as a single operand.
-        (MaybePoisonOperand && MaybePoisonOperand == V))
+  // Pushing freezes up long instruction chains can be expensive. Instead,
+  // we directly push the freeze all the way to the leaves. However, we leave
+  // deduplication of freezes on the same value for freezeOtherUses().
+  Use *OrigUse = &OrigFI.getOperandUse(0);
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Use *, 8> Worklist;
+  Worklist.push_back(OrigUse);
+  while (!Worklist.empty()) {
+    auto *U = Worklist.pop_back_val();
+    Value *V = U->get();
+    if (!CanPushFreeze(V)) {
+      // If we can't push through the original instruction, abort the transform.
+      if (U == OrigUse)
+        return nullptr;
+
+      auto *UserI = cast<Instruction>(U->getUser());
+      Builder.SetInsertPoint(UserI);
+      Value *Frozen = Builder.CreateFreeze(V, V->getName() + ".fr");
+      U->set(Frozen);
       continue;
-    if (!MaybePoisonOperand)
-      MaybePoisonOperand = V;
-    else
-      return nullptr;
-  }
+    }
 
-  OrigOpInst->dropPoisonGeneratingAnnotations();
+    auto *I = cast<Instruction>(V);
+    if (!Visited.insert(I).second)
+      continue;
 
-  // If all operands are guaranteed to be non-poison, we can drop freeze.
-  if (!MaybePoisonOperand)
-    return OrigOp;
+    // reverse() to emit freezes in a more natural order.
+    for (Use &Op : reverse(I->operands())) {
+      Value *OpV = Op.get();
+      if (isa<MetadataAsValue>(OpV) || isGuaranteedNotToBeUndefOrPoison(OpV))
+        continue;
+      Worklist.push_back(&Op);
+    }
 
-  Builder.SetInsertPoint(OrigOpInst);
-  Value *FrozenMaybePoisonOperand = Builder.CreateFreeze(
-      MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr");
+    I->dropPoisonGeneratingAnnotations();
+    this->Worklist.add(I);
+  }
 
-  OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
-  return OrigOp;
+  return OrigUse->get();
 }
 
 Instruction *InstCombinerImpl::foldFreezeIntoRecurrence(FreezeInst &FI,
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 50258af5e26c..42c3d4a4f4c4 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1219,7 +1219,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
     std::optional<TypeSize> Size = AI->getAllocationSize(AI->getDataLayout());
     // Check that size is known and can be stored in IntptrTy.
-    if (!Size || !ConstantInt::isValueValidForType(IntptrTy, *Size))
+    // TODO: Add support for scalable vectors if possible.
+    if (!Size || Size->isScalable() ||
+        !ConstantInt::isValueValidForType(IntptrTy, *Size))
       return;
 
     bool DoPoison = (ID == Intrinsic::lifetime_end);
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 66cdbfcf998c..832592e7663b 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -212,6 +212,15 @@ static cl::opt<float>
                               "OR because of the hot percentile cutoff, if "
                               "both are supplied."));
 
+static cl::opt<bool> ClStaticLinking(
+    "hwasan-static-linking",
+    cl::desc("Don't use .note.hwasan.globals section to instrument globals "
+             "from loadable libraries. "
+             "Note: in static binaries, the global variables section can be "
+             "accessed directly via linker-provided "
+             "__start_hwasan_globals and __stop_hwasan_globals symbols"),
+    cl::Hidden, cl::init(false));
+
 STATISTIC(NumTotalFuncs, "Number of total funcs");
 STATISTIC(NumInstrumentedFuncs, "Number of instrumented funcs");
 STATISTIC(NumNoProfileSummaryFuncs, "Number of funcs without PS");
@@ -335,6 +344,7 @@ private:
                                           FunctionAnalysisManager &FAM) const;
   void initializeModule();
   void createHwasanCtorComdat();
+  void createHwasanNote();
 
   void initializeCallbacks(Module &M);
 
@@ -533,20 +543,7 @@ void HWAddressSanitizerPass::printPipeline(
   OS << '>';
 }
 
-void HWAddressSanitizer::createHwasanCtorComdat() {
-  std::tie(HwasanCtorFunction, std::ignore) =
-      getOrCreateSanitizerCtorAndInitFunctions(
-          M, kHwasanModuleCtorName, kHwasanInitName,
-          /*InitArgTypes=*/{},
-          /*InitArgs=*/{},
-          // This callback is invoked when the functions are created the first
-          // time. Hook them into the global ctors list in that case:
-          [&](Function *Ctor, FunctionCallee) {
-            Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
-            Ctor->setComdat(CtorComdat);
-            appendToGlobalCtors(M, Ctor, 0, Ctor);
-          });
-
+void HWAddressSanitizer::createHwasanNote() {
   // Create a note that contains pointers to the list of global
   // descriptors. Adding a note to the output file will cause the linker to
   // create a PT_NOTE program header pointing to the note that we can use to
@@ -630,6 +627,29 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
   appendToCompilerUsed(M, Dummy);
 }
 
+/// Creates the hwasan module ctor and, unless linking statically, the note.
+void HWAddressSanitizer::createHwasanCtorComdat() {
+  std::tie(HwasanCtorFunction, std::ignore) =
+      getOrCreateSanitizerCtorAndInitFunctions(
+          M, kHwasanModuleCtorName, kHwasanInitName,
+          /*InitArgTypes=*/{},
+          /*InitArgs=*/{},
+          // This callback is invoked when the functions are created the first
+          // time. Hook them into the global ctors list in that case:
+          [&](Function *Ctor, FunctionCallee) {
+            Comdat *CtorComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
+            Ctor->setComdat(CtorComdat);
+            appendToGlobalCtors(M, Ctor, 0, Ctor);
+          });
+  // Do not create .note.hwasan.globals for static binaries, as it is only
+  // needed for instrumenting globals from dynamic libraries. In static
+  // binaries, the global variables section can be accessed directly via the
+  // __start_hwasan_globals and __stop_hwasan_globals symbols inserted by the
+  // linker.
+  if (!ClStaticLinking)
+    createHwasanNote();
+}
+
 /// Module-level initialization.
 ///
 /// inserts a call to __hwasan_init to the module's constructor list.
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index a9a0731f16d9..ecb2f2dbc552 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/ProfileData/DataAccessProf.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ProfileData/MemProfCommon.h"
@@ -75,6 +76,10 @@ static cl::opt<unsigned> MinMatchedColdBytePercent(
     "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
     cl::desc("Min percent of cold bytes matched to hint allocation cold"));
 
+static cl::opt<bool> AnnotateStaticDataSectionPrefix(
+    "memprof-annotate-static-data-prefix", cl::init(false), cl::Hidden,
+    cl::desc("If true, annotate the static data section prefix"));
+
 // Matching statistics
 STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
 STATISTIC(NumOfMemProfMismatch,
@@ -90,6 +95,14 @@ STATISTIC(NumOfMemProfMatchedAllocs,
           "Number of matched memory profile allocs.");
 STATISTIC(NumOfMemProfMatchedCallSites,
           "Number of matched memory profile callsites.");
+STATISTIC(NumOfMemProfHotGlobalVars,
+          "Number of global vars annotated with 'hot' section prefix.");
+STATISTIC(NumOfMemProfColdGlobalVars,
+          "Number of global vars annotated with 'unlikely' section prefix.");
+STATISTIC(NumOfMemProfUnknownGlobalVars,
+          "Number of global vars with unknown hotness (no section prefix).");
+STATISTIC(NumOfMemProfExplicitSectionGlobalVars,
+          "Number of global vars with user-specified section (not annotated).");
 
 static void addCallsiteMetadata(Instruction &I,
                                 ArrayRef<uint64_t> InlinedCallStack,
@@ -674,11 +687,12 @@ MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
 }
 
 PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
-  // Return immediately if the module doesn't contain any function.
-  if (M.empty())
+  // Return immediately if the module doesn't contain any function or global
+  // variables.
+  if (M.empty() && M.globals().empty())
     return PreservedAnalyses::all();
 
-  LLVM_DEBUG(dbgs() << "Read in memory profile:");
+  LLVM_DEBUG(dbgs() << "Read in memory profile:\n");
   auto &Ctx = M.getContext();
   auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
   if (Error E = ReaderOrErr.takeError()) {
@@ -703,6 +717,14 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
     return PreservedAnalyses::all();
   }
 
+  const bool Changed =
+      annotateGlobalVariables(M, MemProfReader->getDataAccessProfileData());
+
+  // If the module doesn't contain any function, return after we process all
+  // global variables.
+  if (M.empty())
+    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
   TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
@@ -752,3 +774,95 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
 
   return PreservedAnalyses::none();
 }
+
+// Returns true iff the global variable has a custom section name, set either
+// by __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or by #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+  if (GVar.hasSection())
+    return true;
+
+  auto Attrs = GVar.getAttributes();
+  if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+      Attrs.hasAttribute("relro-section") ||
+      Attrs.hasAttribute("rodata-section"))
+    return true;
+  return false;
+}
+
+bool MemProfUsePass::annotateGlobalVariables(
+    Module &M, const memprof::DataAccessProfData *DataAccessProf) {
+  if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
+    return false;
+
+  if (!DataAccessProf) {
+    M.getContext().diagnose(DiagnosticInfoPGOProfile(
+        MemoryProfileFileName.data(),
+        StringRef("Data access profiles not found in memprof. Ignore "
+                  "-memprof-annotate-static-data-prefix."),
+        DS_Warning));
+    return false;
+  }
+
+  bool Changed = false;
+  // Iterate over all global variables in the module and annotate them based
+  // on data access profiles. Note it's up to the linker to decide how to map
+  // input sections to output sections, and one conservative practice is to map
+  // unlikely-prefixed ones to the unlikely output section, and map the rest
+  // (hot-prefixed or prefix-less) to the canonical output section.
+  for (GlobalVariable &GVar : M.globals()) {
+    assert(!GVar.getSectionPrefix().has_value() &&
+           "GVar shouldn't have section prefix yet");
+    if (GVar.isDeclarationForLinker())
+      continue;
+
+    if (hasExplicitSectionName(GVar)) {
+      ++NumOfMemProfExplicitSectionGlobalVars;
+      LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
+                        << " has explicit section name. Skip annotating.\n");
+      continue;
+    }
+
+    StringRef Name = GVar.getName();
+    // Skip string literals as their mangled names don't stay stable across
+    // binary releases.
+    // TODO: Track string content hash in the profiles and compute it inside the
+    // compiler to categorize the hotness of string literals.
+    if (Name.starts_with(".str")) {
+
+      LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
+      continue;
+    }
+
+    // DataAccessProfData's get* methods will canonicalize the name under the
+    // hood before looking it up, so the optimizer doesn't need to do it.
+    std::optional<DataAccessProfRecord> Record =
+        DataAccessProf->getProfileRecord(Name);
+    // Annotate a global variable as hot if it has a non-zero sampled count,
+    // and annotate it as cold (unlikely) if it's seen in the profiled binary
+    // file but doesn't have any access sample.
+    // For logging, the optimization remark emitter requires an llvm::Function,
+    // but it's not well defined how to associate a global variable with one.
+    // So we just print out the static data section prefix in LLVM_DEBUG.
+    if (Record && Record->AccessCount > 0) {
+      ++NumOfMemProfHotGlobalVars;
+      GVar.setSectionPrefix("hot");
+      Changed = true;
+      LLVM_DEBUG(dbgs() << "Global variable " << Name
+                        << " is annotated as hot\n");
+    } else if (DataAccessProf->isKnownColdSymbol(Name)) {
+      ++NumOfMemProfColdGlobalVars;
+      GVar.setSectionPrefix("unlikely");
+      Changed = true;
+      LLVM_DEBUG(dbgs() << "Global variable " << Name
+                        << " is annotated as unlikely\n");
+    } else {
+      ++NumOfMemProfUnknownGlobalVars;
+      LLVM_DEBUG(dbgs() << "Global variable " << Name << " is not annotated\n");
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 27292d1a66c3..9899a2aae2b1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3263,7 +3263,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return true;
   }
 
-  /// Heuristically instrument unknown intrinsics.
+  /// Returns whether it was able to heuristically instrument unknown
+  /// intrinsics.
   ///
   /// The main purpose of this code is to do something reasonable with all
   /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
@@ -3273,7 +3274,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   ///
   /// We special-case intrinsics where this approach fails. See llvm.bswap
   /// handling as an example of that.
-  bool handleUnknownIntrinsicUnlogged(IntrinsicInst &I) {
+  bool maybeHandleUnknownIntrinsicUnlogged(IntrinsicInst &I) {
     unsigned NumArgOperands = I.arg_size();
     if (NumArgOperands == 0)
       return false;
@@ -3300,8 +3301,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return false;
   }
 
-  bool handleUnknownIntrinsic(IntrinsicInst &I) {
-    if (handleUnknownIntrinsicUnlogged(I)) {
+  bool maybeHandleUnknownIntrinsic(IntrinsicInst &I) {
+    if (maybeHandleUnknownIntrinsicUnlogged(I)) {
       if (ClDumpHeuristicInstructions)
         dumpInst(I);
 
@@ -3860,7 +3861,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   //
   //       Three operands:
   //         <4 x i32> @llvm.x86.avx512.vpdpbusd.128
-  //                       (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b)
+  //                       (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b)
   //         (this is equivalent to multiply-add on %a and %b, followed by
   //          adding/"accumulating" %s. "Accumulation" stores the result in one
   //          of the source registers, but this accumulate vs. add distinction
@@ -3902,15 +3903,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
            ReturnType->getPrimitiveSizeInBits());
 
     if (I.arg_size() == 3) {
-      assert(ParamType == ReturnType);
-      assert(ParamType == I.getArgOperand(0)->getType());
+      [[maybe_unused]] auto *AccumulatorType =
+          cast<FixedVectorType>(I.getOperand(0)->getType());
+      assert(AccumulatorType == ReturnType);
     }
 
     FixedVectorType *ImplicitReturnType = ReturnType;
     // Step 1: instrument multiplication of corresponding vector elements
     if (EltSizeInBits) {
-      ImplicitReturnType = cast<FixedVectorType>(getMMXVectorTy(
-          EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits()));
+      ImplicitReturnType = cast<FixedVectorType>(
+          getMMXVectorTy(EltSizeInBits * ReductionFactor,
+                         ParamType->getPrimitiveSizeInBits()));
       ParamType = cast<FixedVectorType>(
           getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits()));
 
@@ -3958,7 +3961,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
     // Step 2: instrument horizontal add
     // We don't need bit-precise horizontalReduce because we only want to check
-    // if each pair of elements is fully zero.
+    // if each pair/quad of elements is fully zero.
     // Cast to <4 x i32>.
     Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType);
 
@@ -3968,7 +3971,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                          Constant::getNullValue(Horizontal->getType())),
         ImplicitReturnType);
 
-    // Cast it back to the required fake return type (<1 x i64>).
+    // Cast it back to the required fake return type (if MMX: <1 x i64>; for
+    // AVX, it is already correct).
     if (EltSizeInBits)
       OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I));
 
@@ -5262,7 +5266,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     handleShadowOr(I);
   }
 
-  void visitIntrinsicInst(IntrinsicInst &I) {
+  bool maybeHandleCrossPlatformIntrinsic(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case Intrinsic::uadd_with_overflow:
     case Intrinsic::sadd_with_overflow:
@@ -5342,6 +5346,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVectorReduceWithStarterIntrinsic(I);
       break;
 
+    case Intrinsic::scmp:
+    case Intrinsic::ucmp: {
+      handleShadowOr(I);
+      break;
+    }
+
+    case Intrinsic::fshl:
+    case Intrinsic::fshr:
+      handleFunnelShift(I);
+      break;
+
+    case Intrinsic::is_constant:
+      // The result of llvm.is.constant() is always defined.
+      setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
+      break;
+
+    default:
+      return false;
+    }
+
+    return true;
+  }
+
+  bool maybeHandleX86SIMDIntrinsic(IntrinsicInst &I) {
+    switch (I.getIntrinsicID()) {
     case Intrinsic::x86_sse_stmxcsr:
       handleStmxcsr(I);
       break;
@@ -5392,6 +5422,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }
 
+    // Convert Packed Single Precision Floating-Point Values
+    //   to Packed Signed Doubleword Integer Values
+    //
+    // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
+    //                (<16 x float>, <16 x i32>, i16, i32)
+    case Intrinsic::x86_avx512_mask_cvtps2dq_512:
+      handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false);
+      break;
+
     // Convert Packed Double Precision Floating-Point Values
     //   to Packed Single Precision Floating-Point Values
     case Intrinsic::x86_sse2_cvtpd2ps:
@@ -5492,23 +5531,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::x86_mmx_psrli_q:
     case Intrinsic::x86_mmx_psrai_w:
     case Intrinsic::x86_mmx_psrai_d:
-    case Intrinsic::aarch64_neon_rshrn:
-    case Intrinsic::aarch64_neon_sqrshl:
-    case Intrinsic::aarch64_neon_sqrshrn:
-    case Intrinsic::aarch64_neon_sqrshrun:
-    case Intrinsic::aarch64_neon_sqshl:
-    case Intrinsic::aarch64_neon_sqshlu:
-    case Intrinsic::aarch64_neon_sqshrn:
-    case Intrinsic::aarch64_neon_sqshrun:
-    case Intrinsic::aarch64_neon_srshl:
-    case Intrinsic::aarch64_neon_sshl:
-    case Intrinsic::aarch64_neon_uqrshl:
-    case Intrinsic::aarch64_neon_uqrshrn:
-    case Intrinsic::aarch64_neon_uqshl:
-    case Intrinsic::aarch64_neon_uqshrn:
-    case Intrinsic::aarch64_neon_urshl:
-    case Intrinsic::aarch64_neon_ushl:
-      // Not handled here: aarch64_neon_vsli (vector shift left and insert)
       handleVectorShiftIntrinsic(I, /* Variable */ false);
       break;
     case Intrinsic::x86_avx2_psllv_d:
@@ -5621,19 +5643,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //
     // Multiply and Add Packed Signed and Unsigned Bytes
     //   < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, <16 x i8>, <16 x i8>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <32 x i8>, <32 x i8>)
     //   <16 x i32> @llvm.x86.avx512.vpdpbusd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>)
     //
     // Multiply and Add Unsigned and Signed Bytes With Saturation
     //   < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, <16 x i8>, <16 x i8>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <32 x i8>, <32 x i8>)
     //   <16 x i32> @llvm.x86.avx512.vpdpbusds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>)
     //
     //   < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
     //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
@@ -5652,30 +5674,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //
     // These intrinsics are auto-upgraded into non-masked forms:
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <16 x i8>, <16 x i8>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <16 x i8>, <16 x i8>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <32 x i8>, <32 x i8>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <32 x i8>, <32 x i8>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>, i16)
     //
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <16 x i8>, <16 x i8>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <16 x i8>, <16 x i8>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <32 x i8>, <32 x i8>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <32 x i8>, <32 x i8>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <64 x i8>, <64 x i8>, i16)
     case Intrinsic::x86_avx512_vpdpbusd_128:
     case Intrinsic::x86_avx512_vpdpbusd_256:
     case Intrinsic::x86_avx512_vpdpbusd_512:
@@ -5930,7 +5952,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::x86_avx512_max_pd_512: {
       // These AVX512 variants contain the rounding mode as a trailing flag.
       // Earlier variants do not have a trailing flag and are already handled
-      // by maybeHandleSimpleNomemIntrinsic(I, 0) via handleUnknownIntrinsic.
+      // by maybeHandleSimpleNomemIntrinsic(I, 0) via
+      // maybeHandleUnknownIntrinsic.
       [[maybe_unused]] bool Success =
           maybeHandleSimpleNomemIntrinsic(I, /*trailingFlags=*/1);
       assert(Success);
@@ -5988,15 +6011,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                                         /*trailingVerbatimArgs=*/1);
       break;
 
-    // Convert Packed Single Precision Floating-Point Values
-    //   to Packed Signed Doubleword Integer Values
-    //
-    // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
-    //                (<16 x float>, <16 x i32>, i16, i32)
-    case Intrinsic::x86_avx512_mask_cvtps2dq_512:
-      handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false);
-      break;
-
     // AVX512 PMOV: Packed MOV, with truncation
     // Precisely handled by applying the same intrinsic to the shadow
     case Intrinsic::x86_avx512_mask_pmov_dw_512:
@@ -6074,15 +6088,33 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleAVXGF2P8Affine(I);
       break;
 
-    case Intrinsic::fshl:
-    case Intrinsic::fshr:
-      handleFunnelShift(I);
-      break;
+    default:
+      return false;
+    }
 
-    case Intrinsic::is_constant:
-      // The result of llvm.is.constant() is always defined.
-      setShadow(&I, getCleanShadow(&I));
-      setOrigin(&I, getCleanOrigin());
+    return true;
+  }
+
+  bool maybeHandleArmSIMDIntrinsic(IntrinsicInst &I) {
+    switch (I.getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_rshrn:
+    case Intrinsic::aarch64_neon_sqrshl:
+    case Intrinsic::aarch64_neon_sqrshrn:
+    case Intrinsic::aarch64_neon_sqrshrun:
+    case Intrinsic::aarch64_neon_sqshl:
+    case Intrinsic::aarch64_neon_sqshlu:
+    case Intrinsic::aarch64_neon_sqshrn:
+    case Intrinsic::aarch64_neon_sqshrun:
+    case Intrinsic::aarch64_neon_srshl:
+    case Intrinsic::aarch64_neon_sshl:
+    case Intrinsic::aarch64_neon_uqrshl:
+    case Intrinsic::aarch64_neon_uqrshrn:
+    case Intrinsic::aarch64_neon_uqshl:
+    case Intrinsic::aarch64_neon_uqshrn:
+    case Intrinsic::aarch64_neon_urshl:
+    case Intrinsic::aarch64_neon_ushl:
+      // Not handled here: aarch64_neon_vsli (vector shift left and insert)
+      handleVectorShiftIntrinsic(I, /* Variable */ false);
       break;
 
     // TODO: handling max/min similarly to AND/OR may be more precise
@@ -6233,17 +6265,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }
 
-    case Intrinsic::scmp:
-    case Intrinsic::ucmp: {
-      handleShadowOr(I);
-      break;
-    }
-
     default:
-      if (!handleUnknownIntrinsic(I))
-        visitInstruction(I);
-      break;
+      return false;
     }
+
+    return true;
+  }
+
+  void visitIntrinsicInst(IntrinsicInst &I) {
+    if (maybeHandleCrossPlatformIntrinsic(I))
+      return;
+
+    if (maybeHandleX86SIMDIntrinsic(I))
+      return;
+
+    if (maybeHandleArmSIMDIntrinsic(I))
+      return;
+
+    if (maybeHandleUnknownIntrinsic(I))
+      return;
+    // No handler above claimed the intrinsic; fall back to visitInstruction.
+    visitInstruction(I);
   }
 
   void visitLibAtomicLoad(CallBase &CB) {
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 1ddb8ae9518f..4acc3f2d8469 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -19,9 +19,11 @@
 #include "llvm/Analysis/ConstraintSystem.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -170,10 +172,12 @@ struct State {
   DominatorTree &DT;
   LoopInfo &LI;
   ScalarEvolution &SE;
+  TargetLibraryInfo &TLI;
   SmallVector<FactOrCheck, 64> WorkList;
 
-  State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE)
-      : DT(DT), LI(LI), SE(SE) {}
+  State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE,
+        TargetLibraryInfo &TLI)
+      : DT(DT), LI(LI), SE(SE), TLI(TLI) {}
 
   /// Process block \p BB and add known facts to work-list.
   void addInfoFor(BasicBlock &BB);
@@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) {
   }
 }
 
+static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP,
+                                          uint64_t AccessSize,
+                                          CmpPredicate &Pred, Value *&A,
+                                          Value *&B, const DataLayout &DL,
+                                          const TargetLibraryInfo &TLI) {
+  auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL);
+  if (!Offset.NW.hasNoUnsignedWrap())
+    return false;
+
+  if (Offset.VariableOffsets.size() != 1)
+    return false;
+
+  uint64_t BitWidth = Offset.ConstantOffset.getBitWidth();
+  auto &[Index, Scale] = Offset.VariableOffsets.front();
+  // Bail out on non-canonical GEPs (index width differs from offset width).
+  if (Index->getType()->getScalarSizeInBits() != BitWidth)
+    return false;
+
+  ObjectSizeOpts Opts;
+  // Workaround for gep inbounds, ptr null, idx.
+  Opts.NullIsUnknownSize = true;
+  // Be conservative since we are not clear on whether an out of bounds access
+  // to the padding is UB or not.
+  Opts.RoundToAlign = true;
+  std::optional<TypeSize> Size =
+      getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts);
+  if (!Size || Size->isScalable())
+    return false;
+
+  // Index * Scale + ConstOffset + AccessSize <= AllocSize
+  // With the nuw flag, we know that the index addition doesn't have unsigned
+  // wrap. If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is
+  // no valid value for Index.
+  APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize,
+                          /*isSigned=*/false, /*implicitTrunc=*/true) -
+                    Offset.ConstantOffset)
+                       .udiv(Scale);
+  Pred = ICmpInst::ICMP_ULE;
+  A = Index;
+  B = ConstantInt::get(Index->getType(), MaxIndex);
+  return true;
+}
+
 void State::addInfoFor(BasicBlock &BB) {
   addInfoForInductions(BB);
+  auto &DL = BB.getDataLayout();
 
-  // True as long as long as the current instruction is guaranteed to execute.
+  // True as long as the current instruction is guaranteed to execute.
   bool GuaranteedToExecute = true;
   // Queue conditions and assumes.
   for (Instruction &I : BB) {
@@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) {
       continue;
     }
 
+    auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+      if (!GEP)
+        return;
+      TypeSize AccessSize = DL.getTypeStoreSize(AccessType);
+      if (!AccessSize.isFixed())
+        return;
+      if (GuaranteedToExecute) {
+        CmpPredicate Pred;
+        Value *A, *B;
+        if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(),
+                                          Pred, A, B, DL, TLI)) {
+          // The memory access is guaranteed to execute when BB is entered,
+          // hence the constraint holds on entry to BB.
+          WorkList.emplace_back(FactOrCheck::getConditionFact(
+              DT.getNode(I.getParent()), Pred, A, B));
+        }
+      } else {
+        WorkList.emplace_back(
+            FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
+      }
+    };
+
+    if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      if (!LI->isVolatile())
+        AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType());
+    }
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      if (!SI->isVolatile())
+        AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType());
+    }
+
     auto *II = dyn_cast<IntrinsicInst>(&I);
     Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
     switch (ID) {
@@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A,
   LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n");
 
   auto R = Info.getConstraintForSolving(Pred, A, B);
-  if (R.empty() || !R.isValid(Info)){
+  if (R.empty() || !R.isValid(Info)) {
     LLVM_DEBUG(dbgs() << "   failed to decompose condition\n");
     return std::nullopt;
   }
@@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
 
 static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
                                  ScalarEvolution &SE,
-                                 OptimizationRemarkEmitter &ORE) {
+                                 OptimizationRemarkEmitter &ORE,
+                                 TargetLibraryInfo &TLI) {
   bool Changed = false;
   DT.updateDFSNumbers();
   SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args()));
   ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
-  State S(DT, LI, SE);
+  State S(DT, LI, SE, TLI);
   std::unique_ptr<Module> ReproducerModule(
       DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
 
@@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
         }
         continue;
       }
+
+      auto &DL = F.getDataLayout();
+      auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) {
+        CmpPredicate Pred;
+        Value *A, *B;
+        if (getConstraintFromMemoryAccess(
+                *cast<GetElementPtrInst>(Ptr),
+                DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL,
+                TLI))
+          AddFact(Pred, A, B);
+      };
+
+      if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) {
+        AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType());
+        continue;
+      }
+      if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) {
+        AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType());
+        continue;
+      }
     }
 
     Value *A = nullptr, *B = nullptr;
@@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F,
   auto &LI = AM.getResult<LoopAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-  if (!eliminateConstraints(F, DT, LI, SE, ORE))
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 434b55868c99..944b253e0f5e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -521,7 +521,7 @@ private:
 
     Instruction *SIUse = dyn_cast<Instruction>(SI->user_back());
     // The use of the select inst should be either a phi or another select.
-    if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
+    if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
       return false;
 
     BasicBlock *SIBB = SI->getParent();
@@ -581,15 +581,17 @@ struct AllSwitchPaths {
     VisitedBlocks VB;
     // Get paths from the determinator BBs to SwitchPhiDefBB
     std::vector<ThreadingPath> PathsToPhiDef =
-        getPathsFromStateDefMap(StateDef, SwitchPhi, VB);
-    if (SwitchPhiDefBB == SwitchBlock) {
+        getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+    if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
       TPaths = std::move(PathsToPhiDef);
       return;
     }
 
+    assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
+    auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
     // Find and append paths from SwitchPhiDefBB to SwitchBlock.
     PathsType PathsToSwitchBB =
-        paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1);
+        paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
     if (PathsToSwitchBB.empty())
       return;
 
@@ -610,13 +612,16 @@ private:
   typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
   std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef,
                                                      PHINode *Phi,
-                                                     VisitedBlocks &VB) {
+                                                     VisitedBlocks &VB,
+                                                     unsigned PathsLimit) {
     std::vector<ThreadingPath> Res;
     auto *PhiBB = Phi->getParent();
     VB.insert(PhiBB);
 
     VisitedBlocks UniqueBlocks;
     for (auto *IncomingBB : Phi->blocks()) {
+      if (Res.size() >= PathsLimit)
+        break;
       if (!UniqueBlocks.insert(IncomingBB).second)
         continue;
       if (!SwitchOuterLoop->contains(IncomingBB))
@@ -652,8 +657,9 @@ private:
 
       // Direct predecessor, just add to the path.
       if (IncomingPhiDefBB == IncomingBB) {
-        std::vector<ThreadingPath> PredPaths =
-            getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+        assert(PathsLimit > Res.size());
+        std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap(
+            StateDef, IncomingPhi, VB, PathsLimit - Res.size());
         for (ThreadingPath &Path : PredPaths) {
           Path.push_back(PhiBB);
           Res.push_back(std::move(Path));
@@ -666,13 +672,17 @@ private:
         continue;
 
       PathsType IntermediatePaths;
-      IntermediatePaths =
-          paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1);
+      assert(PathsLimit > Res.size());
+      auto InterPathLimit = PathsLimit - Res.size();
+      IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB,
+                                /* PathDepth = */ 1, InterPathLimit);
       if (IntermediatePaths.empty())
         continue;
 
+      assert(InterPathLimit >= IntermediatePaths.size());
+      auto PredPathLimit = InterPathLimit / IntermediatePaths.size();
       std::vector<ThreadingPath> PredPaths =
-          getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+          getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit);
       for (const ThreadingPath &Path : PredPaths) {
         for (const PathType &IPath : IntermediatePaths) {
           ThreadingPath NewPath(Path);
@@ -687,7 +697,7 @@ private:
   }
 
   PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited,
-                  unsigned PathDepth) {
+                  unsigned PathDepth, unsigned PathsLimit) {
     PathsType Res;
 
     // Stop exploring paths after visiting MaxPathLength blocks
@@ -714,6 +724,8 @@ private:
     // is used to prevent a duplicate path from being generated
     SmallPtrSet<BasicBlock *, 4> Successors;
     for (BasicBlock *Succ : successors(BB)) {
+      if (Res.size() >= PathsLimit)
+        break;
       if (!Successors.insert(Succ).second)
         continue;
 
@@ -735,14 +747,12 @@ private:
       // coverage and compile time.
       if (LI->getLoopFor(Succ) != CurrLoop)
         continue;
-
-      PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1);
+      assert(PathsLimit > Res.size());
+      PathsType SuccPaths =
+          paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size());
       for (PathType &Path : SuccPaths) {
         Path.push_front(BB);
         Res.push_back(Path);
-        if (Res.size() >= MaxNumPaths) {
-          return Res;
-        }
       }
     }
     // This block could now be visited again from a different predecessor. Note
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 4baa3b3eb824..26e17cc849bf 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
 bool GVNPass::performScalarPRE(Instruction *CurInst) {
   if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
       isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
-      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects())
+      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+      CurInst->getType()->isTokenLikeTy())
     return false;
 
   // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index e9bf59c6850a..b60b15b6c3a2 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -35,8 +36,38 @@ static bool tryToImproveAlign(
       return true;
     }
   }
-  // TODO: Also handle memory intrinsics.
-  return false;
+
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+
+  // TODO: Handle more memory intrinsics.
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_store: {
+    int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2;
+    Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load
+                       ? II->getArgOperand(0)
+                       : II->getArgOperand(1);
+    Type *Type = II->getIntrinsicID() == Intrinsic::masked_load
+                     ? II->getType()
+                     : II->getArgOperand(0)->getType();
+
+    Align OldAlign =
+        cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue();
+    Align PrefAlign = DL.getPrefTypeAlign(Type);
+    Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
+    if (NewAlign <= OldAlign)
+      return false;
+
+    Value *V =
+        ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value());
+    II->setOperand(AlignOpIdx, V);
+    return true;
+  }
+  default:
+    return false;
+  }
 }
 
 bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index c2a737d8f9a4..c7d71eb5633e 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     // AvailablePreds vector as we go so that all of the PHI entries for this
     // predecessor use the same bitcast.
     Value *&PredV = I->second;
-    if (PredV->getType() != LoadI->getType())
+    if (PredV->getType() != LoadI->getType()) {
       PredV = CastInst::CreateBitOrPointerCast(
           PredV, LoadI->getType(), "", P->getTerminator()->getIterator());
+      // The new cast is producing the value used to replace the load
+      // instruction, so uses the load's debug location. If P does not always
+      // branch to the load BB however then the debug location must be dropped,
+      // as it is hoisted past a conditional branch.
+      DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1
+                        ? LoadI->getDebugLoc()
+                        : DebugLoc::getDropped();
+      cast<CastInst>(PredV)->setDebugLoc(DL);
+    }
 
     PN->addIncoming(PredV, I->first);
   }
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 03b92d3338a9..0874b29ab7d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -39,6 +39,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/HashRecognize.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -143,6 +144,14 @@ static cl::opt<bool, true>
                      cl::location(DisableLIRP::Wcslen), cl::init(false),
                      cl::ReallyHidden);
 
+bool DisableLIRP::HashRecognize;
+static cl::opt<bool, true>
+    DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
+                             cl::desc("Proceed with loop idiom recognize pass, "
+                                      "but do not optimize CRC loops."),
+                             cl::location(DisableLIRP::HashRecognize),
+                             cl::init(false), cl::ReallyHidden);
+
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
     cl::desc("Use loop idiom recognition code size heuristics when compiling "
@@ -242,6 +251,7 @@ private:
                                   const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);
+  bool optimizeCRCLoop(const PolynomialInfo &Info);
 
   /// @}
   /// \name Noncountable Loop Idiom Handling
@@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
   // but ORE cannot be preserved (see comment before the pass definition).
   OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
 
+  std::optional<PolynomialInfo> HR;
+
   LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                          AR.MSSA, DL, ORE);
   if (!LIR.runOnLoop(&L))
@@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
   HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
   HasMemcpy = TLI->has(LibFunc_memcpy);
 
-  if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
+  if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
+      HasMemcpy || !DisableLIRP::HashRecognize)
     if (SE->hasLoopInvariantBackedgeTakenCount(L))
       return runOnCountableLoop();
 
@@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
 
     MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
   }
+
+  // Optimize a CRC loop if HashRecognize found one, provided we're not
+  // optimizing for size. The rewrite mutates IR, so fold it into MadeChange.
+  if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
+    if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
+      MadeChange |= optimizeCRCLoop(*Res);
+
   return MadeChange;
 }
 
@@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
   return false;
 }
 
+bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
+  // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
+  // carry-less multiplication instructions, which is more efficient than our
+  // Sarwate table-lookup optimization. Hence, until we're able to emit
+  // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
+  // disable the optimization for Hexagon.
+  Module &M = *CurLoop->getHeader()->getModule();
+  Triple TT(M.getTargetTriple());
+  if (TT.getArch() == Triple::hexagon)
+    return false;
+
+  // First, create a private constant GlobalVariable holding the 256-entry
+  // Sarwate lookup table, with each entry materialized as a CRCTy constant.
+  Type *CRCTy = Info.LHS->getType();
+  unsigned CRCBW = CRCTy->getIntegerBitWidth();
+  std::array<Constant *, 256> CRCConstants;
+  transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
+            CRCConstants.begin(),
+            [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
+  Constant *ConstArray =
+      ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
+  GlobalVariable *GV =
+      new GlobalVariable(M, ConstArray->getType(), true,
+                         GlobalValue::PrivateLinkage, ConstArray, ".crctable");
+
+  PHINode *IV = CurLoop->getCanonicalInductionVariable();
+  SmallVector<PHINode *, 2> Cleanup;
+
+  // Next, replace all header PHIs except IV with poison; delete them later.
+  {
+    for (PHINode &PN : CurLoop->getHeader()->phis()) {
+      if (&PN == IV)
+        continue;
+      PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+      Cleanup.push_back(&PN);
+    }
+  }
+
+  // Next, fix up the trip count: the rewritten loop consumes 8 bits per trip.
+  {
+    unsigned NewBTC = (Info.TripCount / 8) - 1; // presumably TripCount >= 8
+    BasicBlock *LoopBlk = CurLoop->getLoopLatch();
+    BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
+    CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
+                                ? ICmpInst::Predicate::ICMP_NE
+                                : ICmpInst::Predicate::ICMP_EQ;
+    Instruction *ExitCond = CurLoop->getLatchCmpInst();
+    Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
+    IRBuilder<> Builder(ExitCond);
+    Value *NewExitCond =
+        Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
+    ExitCond->replaceAllUsesWith(NewExitCond);
+    deleteDeadInstruction(ExitCond);
+  }
+
+  // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
+  // uses of ComputedValue.
+  //
+  // Little-endian:
+  //   crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
+  // Big-Endian:
+  //   crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
+  {
+    auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
+      Type *OpTy = Op->getType();
+      unsigned OpBW = OpTy->getIntegerBitWidth();
+      return OpBW > 8
+                 ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name)
+                 : Op;
+    };
+    auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
+                                 const Twine &Name) {
+      Type *OpTy = Op->getType();
+
+      // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
+      // use the CRC's bitwidth as the reference for shifting right.
+      return LoByte(Builder,
+                    CRCBW > 8 ? Builder.CreateLShr(
+                                    Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
+                              : Op,
+                    Name + ".lo.byte");
+    };
+
+    IRBuilder<> Builder(CurLoop->getHeader(),
+                        CurLoop->getHeader()->getFirstNonPHIIt());
+
+    // Create the CRC PHI, and initialize its incoming value to the initial
+    // value of CRC.
+    PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
+    CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
+
+    // CRC is now an evolving variable, initialized to the PHI.
+    Value *CRC = CRCPhi;
+
+    // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
+    // of LHSAux), if LHSAux is non-nullptr.
+    Value *Indexer = CRC;
+    if (Value *Data = Info.LHSAux) {
+      Type *DataTy = Data->getType();
+
+      // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
+      // shift right by that amount, and take the lo-byte (in the little-endian
+      // case), or shift left by that amount, and take the hi-idx (in the
+      // big-endian case).
+      Value *IVBits = Builder.CreateZExtOrTrunc(
+          Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
+      Value *DataIndexer =
+          Info.ByteOrderSwapped
+              ? Builder.CreateShl(Data, IVBits, "data.indexer")
+              : Builder.CreateLShr(Data, IVBits, "data.indexer");
+      Indexer = Builder.CreateXor(
+          DataIndexer,
+          Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
+          "crc.data.indexer");
+    }
+
+    Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
+                                    : LoByte(Builder, Indexer, "indexer.lo");
+
+    // Always index into a GEP using the index type.
+    Indexer = Builder.CreateZExt(
+        Indexer, SE->getDataLayout().getIndexType(GV->getType()),
+        "indexer.ext");
+
+    // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
+    Value *CRCTableGEP =
+        Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
+    Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
+
+    // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
+    // CRC-8.
+    Value *CRCNext = CRCTableLd;
+    if (CRCBW > 8) {
+      Value *CRCShift = Info.ByteOrderSwapped
+                            ? Builder.CreateShl(CRC, 8, "crc.be.shift")
+                            : Builder.CreateLShr(CRC, 8, "crc.le.shift");
+      CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
+    }
+
+    // Connect the back-edge for the loop, and RAUW the ComputedValue.
+    CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
+    Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
+                                                CurLoop->getLoopLatch());
+  }
+
+  // Cleanup: delete the queued PHIs, and make SCEV forget the loop.
+  {
+    for (PHINode *PN : Cleanup)
+      RecursivelyDeleteDeadPHINode(PN);
+    SE->forgetLoop(CurLoop);
+  }
+  return true;
+}
+
 bool LoopIdiomRecognize::runOnNoncountableLoop() {
   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                     << CurLoop->getHeader()->getParent()->getName()
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f7d2258e1c28..2bda9d83236e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
   UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
   UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
   UP.RuntimeUnrollMultiExit = false;
+  UP.AddAdditionalAccumulators = false;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   ULO.Heart = getLoopConvergenceHeart(L);
   ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
   ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
+  ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
   LoopUnrollResult UnrollResult = UnrollLoop(
       L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8b9d06d7e443..8a5569743ab4 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -247,8 +247,8 @@ private:
   /// index I' according to UserChain produced by function "find".
   ///
   /// The building conceptually takes two steps:
-  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
-  /// that computes I
+  /// 1) iteratively distribute sext/zext/trunc towards the leaves of the
+  /// expression tree that computes I
   /// 2) reassociate the expression tree to the form I' + C.
   ///
   /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
@@ -260,29 +260,30 @@ private:
   Value *rebuildWithoutConstOffset();
 
   /// After the first step of rebuilding the GEP index without the constant
-  /// offset, distribute s/zext to the operands of all operators in UserChain.
-  /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
+  /// offset, distribute sext/zext/trunc to the operands of all operators in
+  /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
   /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
   ///
   /// The function also updates UserChain to point to new subexpressions after
-  /// distributing s/zext. e.g., the old UserChain of the above example is
-  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+  /// distributing sext/zext/trunc. e.g., the old UserChain of the above example
+  /// is
+  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
   /// and the new UserChain is
-  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
-  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
+  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  ///     zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
   ///
   /// \p ChainIndex The index to UserChain. ChainIndex is initially
   ///               UserChain.size() - 1, and is decremented during
   ///               the recursion.
-  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+  Value *distributeCastsAndCloneChain(unsigned ChainIndex);
 
   /// Reassociates the GEP index to the form I' + C and returns I'.
   Value *removeConstOffset(unsigned ChainIndex);
 
-  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
-  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+  /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value
+  /// V.  e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function
   /// returns "sext i32 (zext i16 V to i32) to i64".
-  Value *applyExts(Value *V);
+  Value *applyCasts(Value *V);
 
   /// A helper function that returns whether we can trace into the operands
   /// of binary operator BO for a constant offset.
@@ -307,8 +308,8 @@ private:
   SmallVector<User *, 8> UserChain;
 
   /// A data structure used in rebuildWithoutConstOffset. Contains all
-  /// sext/zext instructions along UserChain.
-  SmallVector<CastInst *, 16> ExtInsts;
+  /// sext/zext/trunc instructions along UserChain.
+  SmallVector<CastInst *, 16> CastInsts;
 
   /// Insertion position of cloned instructions.
   BasicBlock::iterator IP;
@@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
   }
 
   Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
-  // Do not trace into "or" unless it is equivalent to "add".
+  // Do not trace into "or" unless it is equivalent to "add nuw nsw".
   // This is the case if the or's disjoint flag is set.
   if (BO->getOpcode() == Instruction::Or &&
       !cast<PossiblyDisjointInst>(BO)->isDisjoint())
@@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
   if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
     return false;
 
-  // In addition, tracing into BO requires that its surrounding s/zext (if
-  // any) is distributable to both operands.
+  // In addition, tracing into BO requires that its surrounding sext/zext/trunc
+  // (if any) is distributable to both operands.
   //
   // Suppose BO = A op B.
   //  SignExtended | ZeroExtended | Distributable?
@@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
   return ConstantOffset;
 }
 
-Value *ConstantOffsetExtractor::applyExts(Value *V) {
+Value *ConstantOffsetExtractor::applyCasts(Value *V) {
   Value *Current = V;
-  // ExtInsts is built in the use-def order. Therefore, we apply them to V
+  // CastInsts is built in the use-def order. Therefore, we apply them to V
   // in the reversed order.
-  for (CastInst *I : llvm::reverse(ExtInsts)) {
+  for (CastInst *I : llvm::reverse(CastInsts)) {
     if (Constant *C = dyn_cast<Constant>(Current)) {
       // Try to constant fold the cast.
       Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL);
@@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
         continue;
     }
 
-    Instruction *Ext = I->clone();
-    Ext->setOperand(0, Current);
+    Instruction *Cast = I->clone();
+    Cast->setOperand(0, Current);
     // In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
     // we assume that it is ok to redistribute trunc over add/sub/or. But for
     // example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
     // nuw (add A, B))). To make such redistributions legal we drop all the
     // poison generating flags from cloned trunc instructions here.
-    if (isa<TruncInst>(Ext))
-      Ext->dropPoisonGeneratingFlags();
-    Ext->insertBefore(*IP->getParent(), IP);
-    Current = Ext;
+    if (isa<TruncInst>(Cast))
+      Cast->dropPoisonGeneratingFlags();
+    Cast->insertBefore(*IP->getParent(), IP);
+    Current = Cast;
   }
   return Current;
 }
 
 Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
-  distributeExtsAndCloneChain(UserChain.size() - 1);
-  // Remove all nullptrs (used to be s/zext) from UserChain.
+  distributeCastsAndCloneChain(UserChain.size() - 1);
+  // Remove all nullptrs (used to be sext/zext/trunc) from UserChain.
   unsigned NewSize = 0;
   for (User *I : UserChain) {
     if (I != nullptr) {
@@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
 }
 
 Value *
-ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) {
   User *U = UserChain[ChainIndex];
   if (ChainIndex == 0) {
     assert(isa<ConstantInt>(U));
-    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
-    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+    // If U is a ConstantInt, applyCasts will return a ConstantInt as well.
+    return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U));
   }
 
   if (CastInst *Cast = dyn_cast<CastInst>(U)) {
     assert(
         (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
         "Only following instructions can be traced: sext, zext & trunc");
-    ExtInsts.push_back(Cast);
+    CastInsts.push_back(Cast);
     UserChain[ChainIndex] = nullptr;
-    return distributeExtsAndCloneChain(ChainIndex - 1);
+    return distributeCastsAndCloneChain(ChainIndex - 1);
   }
 
   // Function find only trace into BinaryOperator and CastInst.
   BinaryOperator *BO = cast<BinaryOperator>(U);
   // OpNo = which operand of BO is UserChain[ChainIndex - 1]
   unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
-  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
-  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+  Value *TheOther = applyCasts(BO->getOperand(1 - OpNo));
+  Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1);
 
   BinaryOperator *NewBO = nullptr;
   if (OpNo == 0) {
@@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
 
   BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
   assert((BO->use_empty() || BO->hasOneUse()) &&
-         "distributeExtsAndCloneChain clones each BinaryOperator in "
+         "distributeCastsAndCloneChain clones each BinaryOperator in "
          "UserChain, so no one should be used more than "
          "once");
 
@@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) {
   // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)"
   if (const TruncInst *TI = dyn_cast<TruncInst>(U))
     return TI->hasNoUnsignedWrap();
-  return isa<CastInst>(U) || isa<ConstantInt>(U);
+  assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User.");
+  return true;
 }
 
 Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc03da6b..e4ba70d1bce1 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
 static cl::opt<int> UnswitchSiblingsToplevelDiv(
     "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
     cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchParentBlocksDiv(
+    "unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
+    cl::desc("Outer loop size divisor for cost multiplier."));
 static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
     "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
     cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
 }
 
 /// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
 /// cluster of loops. There was an attempt to keep this formula simple,
 /// just enough to limit the worst case behavior. Even if it is not that simple
 /// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
     return 1;
   }
 
+  // Each invariant non-trivial condition, after being unswitched, is supposed
+  // to have its own specialized sibling loop (the invariant condition has been
+  // hoisted out of the child loop into a newly-cloned loop). When unswitching
+  // conditions in nested loops, the basic block size of the outer loop should
+  // not be altered. If such a size significantly increases across unswitching
+  // invocations, something may be wrong; so adjust the final cost taking this
+  // into account.
   auto *ParentL = L.getParentLoop();
+  int ParentLoopSizeMultiplier = 1;
+  if (ParentL)
+    ParentLoopSizeMultiplier =
+        std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
+
   int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
                                : std::distance(LI.begin(), LI.end()));
   // Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
   // at an upper bound.
   int CostMultiplier;
   if (ClonesPower > Log2_32(UnswitchThreshold) ||
-      SiblingsMultiplier > UnswitchThreshold)
+      SiblingsMultiplier > UnswitchThreshold ||
+      ParentLoopSizeMultiplier > UnswitchThreshold)
     CostMultiplier = UnswitchThreshold;
   else
     CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
                               (int)UnswitchThreshold);
 
   LLVM_DEBUG(dbgs() << "  Computed multiplier  " << CostMultiplier
-                    << " (siblings " << SiblingsMultiplier << " * clones "
+                    << " (siblings " << SiblingsMultiplier << " * parent size "
+                    << ParentLoopSizeMultiplier << " * clones "
                     << (1 << ClonesPower) << ")"
                     << " for unswitch candidate: " << TI << "\n");
   return CostMultiplier;
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index bb7dbc2980f5..e05625344ee2 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() {
         continue;
 
       OtherPhi->setIncomingValue(PoisonValBBIdx, V);
-      Phi->setIncomingValue(i, OtherV);
+      if (DT->dominates(OtherV, Phi))
+        Phi->setIncomingValue(i, OtherV);
     }
   }
 }
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index c76b3afef50c..27b13eeaf4d7 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -1285,7 +1285,7 @@ private:
     // Cache misses on the merged chain
     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
     double MergedSize = ChainPred->Size + ChainSucc->Size;
-    double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
+    double MergedDensity = MergedCounts / MergedSize;
     double NewScore = MergedCounts * missProbability(MergedDensity);
 
     return CurScore - NewScore;
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 7063cde5263b..5a09b7385f2b 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -254,7 +254,6 @@ bool llvm::applyDebugifyMetadata(
     }
     if (ApplyToMF)
       ApplyToMF(DIB, F);
-    DIB.finalizeSubprogram(SP);
   }
   DIB.finalize();
 
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 3bbe875bbe9e..1a9e16be6989 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -13,6 +13,8 @@
 
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TimeProfiler.h"
+
 using namespace llvm;
 
 /// Uses the "source_filename" instead of a Module hash ID for the suffix of
@@ -370,6 +372,7 @@ void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); }
 void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
                                   bool ClearDSOLocalOnDeclarations,
                                   SetVector<GlobalValue *> *GlobalsToImport) {
+  llvm::TimeTraceScope timeScope("Rename module for ThinLTO");
   FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
                                                    ClearDSOLocalOnDeclarations);
   ThinLTOProcessing.run();
diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp
index ad91318ae474..fefa49f68c8d 100644
--- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp
+++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp
@@ -427,7 +427,7 @@ void IRNormalizer::reorderInstructions(Function &F) const {
       // Process the remaining instructions.
       //
       // TODO: Do more a intelligent sorting of these instructions. For example,
-      // seperate between dead instructinos and instructions used in another
+      // separate between dead instructions and instructions used in another
       // block. Use properties of the CFG the order instructions that are used
       // in another block.
       if (Visited.contains(&I))
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ac344904f90f..2cfd70a1746c 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3397,8 +3397,8 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
   if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) {
     const APFloat &APF = FP->getValueAPF();
     APInt const &API = APF.bitcastToAPInt();
-    if (auto Temp = API.getZExtValue())
-      return DIB.createConstantValueExpression(static_cast<uint64_t>(Temp));
+    if (uint64_t Temp = API.getZExtValue())
+      return DIB.createConstantValueExpression(Temp);
     return DIB.createConstantValueExpression(*API.getRawData());
   }
 
@@ -3838,8 +3838,8 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
 
 bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
   const auto *Op = I->getOperand(OpIdx);
-  // We can't have a PHI with a metadata type.
-  if (Op->getType()->isMetadataTy())
+  // We can't have a PHI with a metadata or token type.
+  if (Op->getType()->isMetadataTy() || Op->getType()->isTokenLikeTy())
     return false;
 
   // swifterror pointers can only be used by a load, store, or as a swifterror
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index ba0ac01cadd8..735bad1cb134 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -225,9 +225,9 @@ protected:
 
   // Auxiliary function to calculate the number of iterations for a comparison
   // instruction or a binary operator.
-  PeelCounter mergeTwoCounter(const Instruction &CmpOrBinaryOp,
-                              const PeelCounterValue &LHS,
-                              const PeelCounterValue &RHS) const;
+  PeelCounter mergeTwoCounters(const Instruction &CmpOrBinaryOp,
+                               const PeelCounterValue &LHS,
+                               const PeelCounterValue &RHS) const;
 
   // Returns true if the \p Phi is an induction in the target loop. This is a
   // lightweight check and possible to detect an IV in some cases.
@@ -269,15 +269,13 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
       break;
 
     // Avoid infinite loop.
-    if (Visited.contains(Cur))
+    if (!Visited.insert(Cur).second)
       return false;
 
     auto *I = dyn_cast<Instruction>(Cur);
     if (!I || !L.contains(I))
       return false;
 
-    Visited.insert(Cur);
-
     if (auto *Cast = dyn_cast<CastInst>(I)) {
       Cur = Cast->getOperand(0);
     } else if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
@@ -300,14 +298,14 @@ bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
 
 /// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is
 /// considered an IV only if it is an addition or a subtraction. Otherwise the
-/// result can be a value that is neither an loop-invariant nor an IV.
+/// result can be a value that is neither a loop-invariant nor an IV.
 ///
 /// If both \p LHS and \p RHS are loop-invariants, then the result of
 /// \CmpOrBinaryOp is also a loop-invariant.
 PhiAnalyzer::PeelCounter
-PhiAnalyzer::mergeTwoCounter(const Instruction &CmpOrBinaryOp,
-                             const PeelCounterValue &LHS,
-                             const PeelCounterValue &RHS) const {
+PhiAnalyzer::mergeTwoCounters(const Instruction &CmpOrBinaryOp,
+                              const PeelCounterValue &LHS,
+                              const PeelCounterValue &RHS) const {
   auto &[LVal, LTy] = LHS;
   auto &[RVal, RTy] = RHS;
   unsigned NewVal = std::max(LVal, RVal);
@@ -380,7 +378,7 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
       if (RHS == Unknown)
         return Unknown;
       return (IterationsToInvarianceOrInduction[I] =
-                  mergeTwoCounter(*I, *LHS, *RHS));
+                  mergeTwoCounters(*I, *LHS, *RHS));
     }
     if (I->isCast())
       // Cast instructions get the value of the operand.
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 86b268de43cf..b18aceaa67d7 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
 #endif
                     );
 
+static cl::opt<bool> UnrollAddParallelReductions(
+    "unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
+    cl::desc("Allow unrolling to add parallel reduction phis."));
 
 /// Check if unrolling created a situation where we need to insert phi nodes to
 /// preserve LCSSA form.
@@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     OrigPHINode.push_back(cast<PHINode>(I));
   }
 
+  // Collect phi nodes for reductions for which we can introduce multiple
+  // parallel reduction phis and compute the final reduction result after the
+  // loop. This requires a single exit block after unrolling. This is ensured by
+  // restricting to single-block loops where the unrolled iterations are known
+  // to not exit.
+  DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
+  bool CanAddAdditionalAccumulators =
+      (UnrollAddParallelReductions.getNumOccurrences() > 0
+           ? UnrollAddParallelReductions
+           : ULO.AddAdditionalAccumulators) &&
+      !CompletelyUnroll && L->getNumBlocks() == 1 &&
+      (ULO.Runtime ||
+       (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
+                                        ExitInfos[Header].BreakoutTrip == 0))));
+
+  // Limit parallelizing reductions to unroll counts of 4 or less for now.
+  // TODO: The number of parallel reductions should depend on the number of
+  // execution units. We also don't have to add a parallel reduction phi per
+  // unrolled iteration, but could for example add a parallel phi for every 2
+  // unrolled iterations.
+  if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
+    for (PHINode &Phi : Header->phis()) {
+      auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
+      if (!RdxDesc)
+        continue;
+
+      // Only handle duplicate phis for a single reduction for now.
+      // TODO: Handle any number of reductions
+      if (!Reductions.empty())
+        continue;
+
+      Reductions[&Phi] = *RdxDesc;
+    }
+  }
+
   std::vector<BasicBlock *> Headers;
   std::vector<BasicBlock *> Latches;
   Headers.push_back(Header);
@@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   // latch.  This is a reasonable default placement if we don't have block
   // frequencies, and if we do, well the layout will be adjusted later.
   auto BlockInsertPt = std::next(LatchBlock->getIterator());
+  SmallVector<Instruction *> PartialReductions;
   for (unsigned It = 1; It != ULO.Count; ++It) {
     SmallVector<BasicBlock *, 8> NewBlocks;
     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         for (PHINode *OrigPHI : OrigPHINode) {
           PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
           Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+
+          // Use cloned phis as parallel phis for partial reductions, which will
+          // get combined to the final reduction result after the loop.
+          if (Reductions.contains(OrigPHI)) {
+            // Collect partial reduction results.
+            if (PartialReductions.empty())
+              PartialReductions.push_back(cast<Instruction>(InVal));
+            PartialReductions.push_back(cast<Instruction>(VMap[InVal]));
+
+            // Update the start value for the cloned phis to use the identity
+            // value for the reduction.
+            const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
+            NewPHI->setIncomingValueForBlock(
+                L->getLoopPreheader(),
+                getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
+                                      OrigPHI->getType(),
+                                      RdxDesc.getFastMathFlags()));
+
+            // Update NewPHI to use the cloned value for the iteration and move
+            // to header.
+            NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
+            NewPHI->moveBefore(OrigPHI->getIterator());
+            continue;
+          }
+
           if (Instruction *InValI = dyn_cast<Instruction>(InVal))
             if (It > 1 && L->contains(InValI))
               InVal = LastValueMap[InValI];
@@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
       PN->eraseFromParent();
     } else if (ULO.Count > 1) {
+      if (Reductions.contains(PN))
+        continue;
+
       Value *InVal = PN->removeIncomingValue(LatchBlock, false);
       // If this value was defined in the loop, take the value defined by the
       // last iteration of the loop.
@@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     }
   }
 
+  // If there are partial reductions, create code in the exit block to compute
+  // the final result and update users of the final result.
+  if (!PartialReductions.empty()) {
+    BasicBlock *ExitBlock = L->getExitBlock();
+    assert(ExitBlock &&
+           "Can only introduce parallel reduction phis with single exit block");
+    assert(Reductions.size() == 1 &&
+           "currently only a single reduction is supported");
+    Value *FinalRdxValue = PartialReductions.back();
+    Value *RdxResult = nullptr;
+    for (PHINode &Phi : ExitBlock->phis()) {
+      if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
+        continue;
+      if (!RdxResult) {
+        RdxResult = PartialReductions.front();
+        IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
+        RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
+        for (Instruction *RdxPart : drop_begin(PartialReductions)) {
+          RdxResult = Builder.CreateBinOp(
+              (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
+              RdxPart, RdxResult, "bin.rdx");
+        }
+        NeedToFixLCSSA = true;
+        for (Instruction *RdxPart : PartialReductions)
+          RdxPart->dropPoisonGeneratingFlags();
+      }
+
+      Phi.replaceAllUsesWith(RdxResult);
+      continue;
+    }
+  }
+
   if (DTUToUse) {
     // Apply updates to the DomTree.
     DT = &DTU.getDomTree();
@@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
   }
   return nullptr;
 }
+
+std::optional<RecurrenceDescriptor>
+llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
+                                           ScalarEvolution *SE) {
+  RecurrenceDescriptor RdxDesc;
+  if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc,
+                                            /*DemandedBits=*/nullptr,
+                                            /*AC=*/nullptr, /*DT=*/nullptr, SE))
+    return std::nullopt;
+  RecurKind RK = RdxDesc.getRecurrenceKind();
+  // Skip unsupported reductions.
+  // TODO: Handle additional reductions, including FP and min-max
+  // reductions.
+  if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
+      RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+    return std::nullopt;
+
+  if (RdxDesc.IntermediateStore)
+    return std::nullopt;
+
+  // Check for a latch before it is used to look up the reduction update.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return std::nullopt;
+  auto *RdxUpdate = cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+
+  // Don't unroll reductions with constant ops; those can be folded to a
+  // single induction update.
+  if (any_of(RdxUpdate->operands(), IsaPred<Constant>))
+    return std::nullopt;
+
+  // The update in the latch must use the phi directly as an operand.
+  if (!is_contained(RdxUpdate->operands(), &Phi))
+    return std::nullopt;
+  return RdxDesc;
+}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 843364eb34f8..b172ef6ba080 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -2032,6 +2032,7 @@ Value *llvm::addRuntimeChecks(
     MemoryRuntimeCheck = IsConflict;
   }
 
+  Exp.eraseDeadInstructions(MemoryRuntimeCheck);
   return MemoryRuntimeCheck;
 }
 
@@ -2077,6 +2078,7 @@ Value *llvm::addDiffRuntimeChecks(
     MemoryRuntimeCheck = IsConflict;
   }
 
+  Expander.eraseDeadInstructions(MemoryRuntimeCheck);
   return MemoryRuntimeCheck;
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 1711163fb9f5..ec2e6c1ab796 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -81,6 +81,8 @@ void LoopVersioning::versionLoop(
   } else
     RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
 
+  Exp.eraseDeadInstructions(SCEVRuntimeCheck);
+
   assert(RuntimeCheck && "called even though we don't need "
                          "any runtime checks");
 
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
index 41647f7717a4..faacd422c009 100644
--- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -155,12 +155,15 @@ PreservedAnalyses ProfileVerifierPass::run(Function &F,
                                            FunctionAnalysisManager &FAM) {
   const auto EntryCount = F.getEntryCount(/*AllowSynthetic=*/true);
   if (!EntryCount) {
-    F.getContext().emitError("Profile verification failed: function entry "
-                             "count missing (set to 0 if cold)");
+    auto *MD = F.getMetadata(LLVMContext::MD_prof);
+    if (!MD || !isExplicitlyUnknownProfileMetadata(*MD)) {
+      F.getContext().emitError("Profile verification failed: function entry "
+                               "count missing (set to 0 if cold)");
+      return PreservedAnalyses::all();
+    }
+  } else if (EntryCount->getCount() == 0) {
     return PreservedAnalyses::all();
   }
-  if (EntryCount->getCount() == 0)
-    return PreservedAnalyses::all();
   for (const auto &BB : F) {
     if (AnnotateSelect) {
       for (const auto &I : BB)
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 10c162bc6463..d93a4d87f30f 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -849,9 +849,12 @@ void PromoteMem2Reg::run() {
   for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
     IncomingVals.init(i, UndefValue::get(Allocas[i]->getAllocatedType()));
 
-  // When handling debug info, treat all incoming values as if they have unknown
-  // locations until proven otherwise.
+  // When handling debug info, treat all incoming values as if they have
+  // compiler-generated (empty) locations, representing the uninitialized
+  // alloca, until proven otherwise.
   IncomingLocs.resize(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    IncomingLocs.init(i, DebugLoc::getCompilerGenerated());
 
   // The renamer uses the Visited set to avoid infinite loops.
   Visited.resize(F.getMaxBlockNumber(), false);
diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
index d53a3144bf57..a814867652cd 100644
--- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
+++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
@@ -21,29 +21,20 @@
 
 using namespace llvm;
 
-static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
+struct LookupTableInfo {
+  Value *Index = nullptr; // Single variable index of the GEP into the table.
+  SmallVector<Constant *> Ptrs; // Pointer constants read from the initializer.
+};
+
+static bool shouldConvertToRelLookupTable(LookupTableInfo &Info, Module &M,
+                                          GlobalVariable &GV) {
   // If lookup table has more than one user,
   // do not generate a relative lookup table.
   // This is to simplify the analysis that needs to be done for this pass.
   // TODO: Add support for lookup tables with multiple uses.
   // For ex, this can happen when a function that uses a lookup table gets
   // inlined into multiple call sites.
-  if (!GV.hasInitializer() ||
-      !GV.isConstant() ||
-      !GV.hasOneUse())
-    return false;
-
-  GetElementPtrInst *GEP =
-      dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser());
-  if (!GEP || !GEP->hasOneUse() ||
-      GV.getValueType() != GEP->getSourceElementType())
-    return false;
-
-  LoadInst *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser());
-  if (!Load || !Load->hasOneUse() ||
-      Load->getType() != GEP->getResultElementType())
-    return false;
-
+  //
   // If the original lookup table does not have local linkage and is
   // not dso_local, do not generate a relative lookup table.
   // This optimization creates a relative lookup table that consists of
@@ -51,21 +42,40 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
   // To be able to generate these offsets, relative lookup table and
   // its elements should have internal linkage and be dso_local, which means
   // that they should resolve to symbols within the same linkage unit.
-  if (!GV.hasLocalLinkage() ||
-      !GV.isDSOLocal() ||
-      !GV.isImplicitDSOLocal())
+  if (!GV.hasInitializer() || !GV.isConstant() || !GV.hasOneUse() ||
+      !GV.hasLocalLinkage() || !GV.isDSOLocal() || !GV.isImplicitDSOLocal())
     return false;
 
-  ConstantArray *Array = dyn_cast<ConstantArray>(GV.getInitializer());
-  if (!Array)
+  auto *GEP = dyn_cast<GetElementPtrInst>(GV.use_begin()->getUser());
+  if (!GEP || !GEP->hasOneUse())
+    return false;
+
+  auto *Load = dyn_cast<LoadInst>(GEP->use_begin()->getUser());
+  if (!Load || !Load->hasOneUse())
     return false;
 
   // If values are not 64-bit pointers, do not generate a relative lookup table.
   const DataLayout &DL = M.getDataLayout();
-  Type *ElemType = Array->getType()->getElementType();
+  Type *ElemType = Load->getType();
   if (!ElemType->isPointerTy() || DL.getPointerTypeSizeInBits(ElemType) != 64)
     return false;
 
+  // Make sure this is a gep of the form GV + scale*var.
+  unsigned IndexWidth =
+      DL.getIndexTypeSizeInBits(Load->getPointerOperand()->getType());
+  SmallMapVector<Value *, APInt, 4> VarOffsets;
+  APInt ConstOffset(IndexWidth, 0);
+  if (!GEP->collectOffset(DL, IndexWidth, VarOffsets, ConstOffset) ||
+      !ConstOffset.isZero() || VarOffsets.size() != 1)
+    return false;
+
+  // This can't be a pointer lookup table if the stride is smaller than a
+  // pointer.
+  Info.Index = VarOffsets.front().first;
+  const APInt &Stride = VarOffsets.front().second;
+  if (Stride.ult(DL.getTypeStoreSize(ElemType)))
+    return false;
+
   SmallVector<GlobalVariable *, 4> GVOps;
   Triple TT = M.getTargetTriple();
   // FIXME: This should be removed in the future.
@@ -80,14 +90,20 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
       // https://github.com/rust-lang/rust/issues/141306.
       || (TT.isX86() && TT.isOSDarwin());
 
-  for (const Use &Op : Array->operands()) {
-    Constant *ConstOp = cast<Constant>(&Op);
+  APInt Offset(IndexWidth, 0);
+  uint64_t GVSize = DL.getTypeAllocSize(GV.getValueType());
+  for (; Offset.ult(GVSize); Offset += Stride) {
+    Constant *C =
+        ConstantFoldLoadFromConst(GV.getInitializer(), ElemType, Offset, DL);
+    if (!C)
+      return false;
+
     GlobalValue *GVOp;
-    APInt Offset;
+    APInt GVOffset;
 
     // If an operand is not a constant offset from a lookup table,
     // do not generate a relative lookup table.
-    if (!IsConstantOffsetFromGlobal(ConstOp, GVOp, Offset, DL))
+    if (!IsConstantOffsetFromGlobal(C, GVOp, GVOffset, DL))
       return false;
 
     // If operand is mutable, do not generate a relative lookup table.
@@ -102,6 +118,8 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
 
     if (ShouldDropUnnamedAddr)
       GVOps.push_back(GlovalVarOp);
+
+    Info.Ptrs.push_back(C);
   }
 
   if (ShouldDropUnnamedAddr)
@@ -111,14 +129,12 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) {
   return true;
 }
 
-static GlobalVariable *createRelLookupTable(Function &Func,
+static GlobalVariable *createRelLookupTable(LookupTableInfo &Info,
+                                            Function &Func,
                                             GlobalVariable &LookupTable) {
   Module &M = *Func.getParent();
-  ConstantArray *LookupTableArr =
-      cast<ConstantArray>(LookupTable.getInitializer());
-  unsigned NumElts = LookupTableArr->getType()->getNumElements();
   ArrayType *IntArrayTy =
-      ArrayType::get(Type::getInt32Ty(M.getContext()), NumElts);
+      ArrayType::get(Type::getInt32Ty(M.getContext()), Info.Ptrs.size());
 
   GlobalVariable *RelLookupTable = new GlobalVariable(
       M, IntArrayTy, LookupTable.isConstant(), LookupTable.getLinkage(),
@@ -127,10 +143,9 @@ static GlobalVariable *createRelLookupTable(Function &Func,
       LookupTable.isExternallyInitialized());
 
   uint64_t Idx = 0;
-  SmallVector<Constant *, 64> RelLookupTableContents(NumElts);
+  SmallVector<Constant *, 64> RelLookupTableContents(Info.Ptrs.size());
 
-  for (Use &Operand : LookupTableArr->operands()) {
-    Constant *Element = cast<Constant>(Operand);
+  for (Constant *Element : Info.Ptrs) {
     Type *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
     Constant *Base = llvm::ConstantExpr::getPtrToInt(RelLookupTable, IntPtrTy);
     Constant *Target = llvm::ConstantExpr::getPtrToInt(Element, IntPtrTy);
@@ -148,7 +163,8 @@ static GlobalVariable *createRelLookupTable(Function &Func,
   return RelLookupTable;
 }
 
-static void convertToRelLookupTable(GlobalVariable &LookupTable) {
+static void convertToRelLookupTable(LookupTableInfo &Info,
+                                    GlobalVariable &LookupTable) {
   GetElementPtrInst *GEP =
       cast<GetElementPtrInst>(LookupTable.use_begin()->getUser());
   LoadInst *Load = cast<LoadInst>(GEP->use_begin()->getUser());
@@ -159,21 +175,21 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) {
   Function &Func = *BB->getParent();
 
   // Generate an array that consists of relative offsets.
-  GlobalVariable *RelLookupTable = createRelLookupTable(Func, LookupTable);
+  GlobalVariable *RelLookupTable =
+      createRelLookupTable(Info, Func, LookupTable);
 
   // Place new instruction sequence before GEP.
   Builder.SetInsertPoint(GEP);
-  Value *Index = GEP->getOperand(2);
-  IntegerType *IntTy = cast<IntegerType>(Index->getType());
-  Value *Offset =
-      Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift");
+  IntegerType *IntTy = cast<IntegerType>(Info.Index->getType());
+  Value *Offset = Builder.CreateShl(Info.Index, ConstantInt::get(IntTy, 2),
+                                    "reltable.shift");
 
   // Insert the call to load.relative intrinsic before LOAD.
   // GEP might not be immediately followed by a LOAD, like it can be hoisted
   // outside the loop or another instruction might be inserted them in between.
   Builder.SetInsertPoint(Load);
   Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration(
-      &M, Intrinsic::load_relative, {Index->getType()});
+      &M, Intrinsic::load_relative, {Info.Index->getType()});
 
   // Create a call to load.relative intrinsic that computes the target address
   // by adding base address (lookup table address) and relative offset.
@@ -205,10 +221,11 @@ static bool convertToRelativeLookupTables(
   bool Changed = false;
 
   for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
-    if (!shouldConvertToRelLookupTable(M, GV))
+    LookupTableInfo Info;
+    if (!shouldConvertToRelLookupTable(Info, M, GV))
       continue;
 
-    convertToRelLookupTable(GV);
+    convertToRelLookupTable(Info, GV);
 
     // Remove the original lookup table.
     GV.eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 060ca92e559a..28befd0aa1ce 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
@@ -175,6 +176,26 @@ SCEVExpander::findInsertPointAfter(Instruction *I,
   return IP;
 }
 
+void SCEVExpander::eraseDeadInstructions(Value *Root) {
+  SmallPtrSet<Value *, 8> Erased;
+  SmallVector<Value *> Pending;
+  append_range(Pending, getAllInsertedInstructions());
+  while (!Pending.empty()) {
+    Value *Cand = Pending.pop_back_val();
+    // Consult Erased first so an erased value is never passed to dyn_cast.
+    auto *Inst = Erased.contains(Cand) ? nullptr : dyn_cast<Instruction>(Cand);
+    if (!Inst || Inst == Root || !isInsertedInstruction(Inst) ||
+        !isInstructionTriviallyDead(Inst))
+      continue;
+    // Operands may become dead once Inst is gone, so revisit them.
+    append_range(Pending, Inst->operands());
+    InsertedValues.erase(Inst);
+    InsertedPostIncValues.erase(Inst);
+    Erased.insert(Inst);
+    Inst->eraseFromParent();
+  }
+}
+
 BasicBlock::iterator
 SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const {
   // Cast the argument at the beginning of the entry block, after
@@ -1239,10 +1260,13 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
     if (!isa<SCEVAddRecExpr>(ExitSCEV))
       continue;
     Type *PhiTy = PN.getType();
-    if (STy->isIntegerTy() && PhiTy->isPointerTy())
+    if (STy->isIntegerTy() && PhiTy->isPointerTy()) {
       ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
-    else if (S->getType() != PN.getType())
+      if (isa<SCEVCouldNotCompute>(ExitSCEV))
+        continue;
+    } else if (S->getType() != PN.getType()) {
       continue;
+    }
 
     // Check if we can re-use the existing PN, by adjusting it with an expanded
     // offset, if the offset is simpler.
@@ -2184,8 +2208,15 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   // negative. If Step is known to be positive or negative, only create
   // either 1. or 2.
   auto ComputeEndCheck = [&]() -> Value * {
-    // Checking <u 0 is always false.
-    if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+    // Checking <u 0 is always false, if (Step * trunc ExitCount) does not wrap.
+    // TODO: Predicates that can be proven true/false should be discarded when
+    // the predicates are created, not late during expansion.
+    if (!Signed && Start->isZero() && SE.isKnownPositive(Step) &&
+        DstBits < SrcBits &&
+        ExitCount == SE.getZeroExtendExpr(SE.getTruncateExpr(ExitCount, ARTy),
+                                          ExitCount->getType()) &&
+        SE.willNotOverflow(Instruction::Mul, Signed, Step,
+                           SE.getTruncateExpr(ExitCount, ARTy)))
       return ConstantInt::getFalse(Loc->getContext());
 
     // Get the backedge taken count and truncate or extended to the AR type.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7a538ae2c583..970f85378d3d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -612,6 +612,18 @@ private:
   /// If CompValue is already set, the function is expected to fail if a match
   /// is found but the value compared to is different.
   bool matchInstruction(Instruction *I, bool isEQ) {
+    if (match(I, m_Not(m_Instruction(I))))
+      isEQ = !isEQ;
+
+    Value *Val;
+    if (match(I, m_NUWTrunc(m_Value(Val)))) {
+      // If we already have a value for the switch, it has to match!
+      if (!setValueOnce(Val))
+        return false;
+      UsedICmps++;
+      Vals.push_back(ConstantInt::get(cast<IntegerType>(Val->getType()), isEQ));
+      return true;
+    }
     // If this is an icmp against a constant, handle this as one of the cases.
     ICmpInst *ICI;
     ConstantInt *C;
@@ -2260,10 +2272,6 @@ static bool canSinkInstructions(
 
   for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
     Value *Op = I0->getOperand(OI);
-    if (Op->getType()->isTokenTy())
-      // Don't touch any operand of token type.
-      return false;
-
     auto SameAsI0 = [&I0, OI](const Instruction *I) {
       assert(I->getNumOperands() == I0->getNumOperands());
       return I->getOperand(OI) == I0->getOperand(OI);
@@ -2764,8 +2772,7 @@ bool CompatibleSets::shouldBelongToSameSet(ArrayRef<InvokeInst *> Invokes) {
     Use &U1 = std::get<1>(Ops);
     if (U0 == U1)
       return false;
-    return U0->getType()->isTokenTy() ||
-           !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()),
+    return !canReplaceOperandWithVariable(cast<Instruction>(U0.getUser()),
                                           U0.getOperandNo());
   };
   assert(Invokes.size() == 2 && "Always called with exactly two candidates.");
@@ -4404,10 +4411,12 @@ static bool mergeConditionalStoreToAddress(
 
   // OK, we're going to sink the stores to PostBB. The store has to be
   // conditional though, so first create the predicate.
-  Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
-                     ->getCondition();
-  Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
-                     ->getCondition();
+  BranchInst *PBranch =
+      cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator());
+  BranchInst *QBranch =
+      cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator());
+  Value *PCond = PBranch->getCondition();
+  Value *QCond = QBranch->getCondition();
 
   Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
                                                 PStore->getParent());
@@ -4418,13 +4427,11 @@ static bool mergeConditionalStoreToAddress(
   IRBuilder<> QB(PostBB, PostBBFirst);
   QB.SetCurrentDebugLocation(PostBBFirst->getStableDebugLoc());
 
-  Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
-  Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
+  InvertPCond ^= (PStore->getParent() != PTB);
+  InvertQCond ^= (QStore->getParent() != QTB);
+  Value *PPred = InvertPCond ? QB.CreateNot(PCond) : PCond;
+  Value *QPred = InvertQCond ? QB.CreateNot(QCond) : QCond;
 
-  if (InvertPCond)
-    PPred = QB.CreateNot(PPred);
-  if (InvertQCond)
-    QPred = QB.CreateNot(QPred);
   Value *CombinedPred = QB.CreateOr(PPred, QPred);
 
   BasicBlock::iterator InsertPt = QB.GetInsertPoint();
@@ -4808,23 +4815,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
       SelectInst *NV = cast<SelectInst>(
           Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
       PN.setIncomingValue(PBBIdx, NV);
-      // Although the select has the same condition as PBI, the original branch
-      // weights for PBI do not apply to the new select because the select's
-      // 'logical' edges are incoming edges of the phi that is eliminated, not
-      // the outgoing edges of PBI.
+      // The select has the same condition as PBI, in the same BB. The
+      // probabilities don't change.
       if (HasWeights) {
-        uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
-        uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
-        uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
-        uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
-        // The weight to PredCommonDest should be PredCommon * SuccTotal.
-        // The weight to PredOtherDest should be PredOther * SuccCommon.
-        uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
-                                  PredOther * SuccCommon};
-
-        fitWeights(NewWeights);
-
-        setBranchWeights(NV, NewWeights[0], NewWeights[1],
+        uint64_t TrueWeight = PBIOp ? PredFalseWeight : PredTrueWeight;
+        uint64_t FalseWeight = PBIOp ? PredTrueWeight : PredFalseWeight;
+        setBranchWeights(NV, TrueWeight, FalseWeight,
                          /*IsExpected=*/false);
       }
     }
@@ -6437,34 +6433,42 @@ static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
 
 namespace {
 
-/// This class represents a lookup table that can be used to replace a switch.
-class SwitchLookupTable {
+/// This class finds alternatives for switches to ultimately
+/// replace the switch.
+class SwitchReplacement {
 public:
-  /// Create a lookup table to use as a switch replacement with the contents
-  /// of Values, using DefaultValue to fill any holes in the table.
-  SwitchLookupTable(
+  /// Create a helper for optimizations to use as a switch replacement.
+  /// Find a better representation for the content of Values,
+  /// using DefaultValue to fill any holes in the table.
+  SwitchReplacement(
       Module &M, uint64_t TableSize, ConstantInt *Offset,
       const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
       Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
 
-  /// Build instructions with Builder to retrieve the value at
-  /// the position given by Index in the lookup table.
-  Value *buildLookup(Value *Index, IRBuilder<> &Builder, const DataLayout &DL);
+  /// Build instructions with Builder to retrieve values using Index
+  /// and replace the switch.
+  Value *replaceSwitch(Value *Index, IRBuilder<> &Builder, const DataLayout &DL,
+                       Function *Func);
 
   /// Return true if a table with TableSize elements of
   /// type ElementType would fit in a target-legal register.
   static bool wouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
                                  Type *ElementType);
 
+  /// Return the default value of the switch.
+  Constant *getDefaultValue();
+
+  /// Return true if the replacement is a lookup table.
+  bool isLookupTable();
+
 private:
-  // Depending on the contents of the table, it can be represented in
-  // different ways.
+  // Depending on the switch, there are different alternatives.
   enum {
-    // For tables where each element contains the same value, we just have to
+    // For switches where each case contains the same value, we just have to
     // store that single value and return it for each lookup.
     SingleValueKind,
 
-    // For tables where there is a linear relationship between table index
+    // For switches where there is a linear relationship between table index
     // and values. We calculate the result with a simple multiplication
     // and addition instead of a table lookup.
     LinearMapKind,
@@ -6476,9 +6480,15 @@ private:
 
     // The table is stored as an array of values. Values are retrieved by load
     // instructions from the table.
-    ArrayKind
+    LookupTableKind
   } Kind;
 
+  // The default value of the switch.
+  Constant *DefaultValue;
+
+  // The type of the output values.
+  Type *ValueType;
+
   // For SingleValueKind, this is the single value.
   Constant *SingleValue = nullptr;
 
@@ -6491,23 +6501,24 @@ private:
   ConstantInt *LinearMultiplier = nullptr;
   bool LinearMapValWrapped = false;
 
-  // For ArrayKind, this is the array.
-  GlobalVariable *Array = nullptr;
+  // For LookupTableKind, this is the table.
+  Constant *Initializer = nullptr;
 };
 
 } // end anonymous namespace
 
-SwitchLookupTable::SwitchLookupTable(
+SwitchReplacement::SwitchReplacement(
     Module &M, uint64_t TableSize, ConstantInt *Offset,
     const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
-    Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
+    Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName)
+    : DefaultValue(DefaultValue) {
   assert(Values.size() && "Can't build lookup table without values!");
   assert(TableSize >= Values.size() && "Can't fit values in table!");
 
   // If all values in the table are equal, this is that value.
   SingleValue = Values.begin()->second;
 
-  Type *ValueType = Values.begin()->second->getType();
+  ValueType = Values.begin()->second->getType();
 
   // Build up the table contents.
   SmallVector<Constant *, 64> TableContents(TableSize);
@@ -6597,7 +6608,6 @@ SwitchLookupTable::SwitchLookupTable(
         (void)M.smul_ov(APInt(M.getBitWidth(), TableSize - 1), MayWrap);
       LinearMapValWrapped = NonMonotonic || MayWrap;
       Kind = LinearMapKind;
-      ++NumLinearMaps;
       return;
     }
   }
@@ -6617,30 +6627,23 @@ SwitchLookupTable::SwitchLookupTable(
     BitMap = ConstantInt::get(M.getContext(), TableInt);
     BitMapElementTy = IT;
     Kind = BitMapKind;
-    ++NumBitMaps;
     return;
   }
 
   // Store the table in an array.
-  ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
-  Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
-
-  Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
-                             GlobalVariable::PrivateLinkage, Initializer,
-                             "switch.table." + FuncName);
-  Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  // Set the alignment to that of an array items. We will be only loading one
-  // value out of it.
-  Array->setAlignment(DL.getPrefTypeAlign(ValueType));
-  Kind = ArrayKind;
+  auto *TableTy = ArrayType::get(ValueType, TableSize);
+  Initializer = ConstantArray::get(TableTy, TableContents);
+
+  Kind = LookupTableKind;
 }
 
-Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
-                                      const DataLayout &DL) {
+Value *SwitchReplacement::replaceSwitch(Value *Index, IRBuilder<> &Builder,
+                                        const DataLayout &DL, Function *Func) {
   switch (Kind) {
   case SingleValueKind:
     return SingleValue;
   case LinearMapKind: {
+    ++NumLinearMaps;
     // Derive the result value from the input value.
     Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
                                           false, "switch.idx.cast");
@@ -6656,6 +6659,7 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
     return Result;
   }
   case BitMapKind: {
+    ++NumBitMaps;
     // Type of the bitmap (e.g. i59).
     IntegerType *MapTy = BitMap->getIntegerType();
 
@@ -6677,9 +6681,18 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
     // Mask off.
     return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
   }
-  case ArrayKind: {
-    Type *IndexTy = DL.getIndexType(Array->getType());
-    auto *ArrayTy = cast<ArrayType>(Array->getValueType());
+  case LookupTableKind: {
+    ++NumLookupTables;
+    auto *Table =
+        new GlobalVariable(*Func->getParent(), Initializer->getType(),
+                           /*isConstant=*/true, GlobalVariable::PrivateLinkage,
+                           Initializer, "switch.table." + Func->getName());
+    Table->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    // Set the alignment to that of an array item. We will only be loading one
+    // value out of it.
+    Table->setAlignment(DL.getPrefTypeAlign(ValueType));
+    Type *IndexTy = DL.getIndexType(Table->getType());
+    auto *ArrayTy = cast<ArrayType>(Table->getValueType());
 
     if (Index->getType() != IndexTy) {
       unsigned OldBitWidth = Index->getType()->getIntegerBitWidth();
@@ -6691,14 +6704,14 @@ Value *SwitchLookupTable::buildLookup(Value *Index, IRBuilder<> &Builder,
 
     Value *GEPIndices[] = {ConstantInt::get(IndexTy, 0), Index};
     Value *GEP =
-        Builder.CreateInBoundsGEP(ArrayTy, Array, GEPIndices, "switch.gep");
+        Builder.CreateInBoundsGEP(ArrayTy, Table, GEPIndices, "switch.gep");
     return Builder.CreateLoad(ArrayTy->getElementType(), GEP, "switch.load");
   }
   }
-  llvm_unreachable("Unknown lookup table kind!");
+  llvm_unreachable("Unknown helper kind!");
 }
 
-bool SwitchLookupTable::wouldFitInRegister(const DataLayout &DL,
+bool SwitchReplacement::wouldFitInRegister(const DataLayout &DL,
                                            uint64_t TableSize,
                                            Type *ElementType) {
   auto *IT = dyn_cast<IntegerType>(ElementType);
@@ -6734,6 +6747,10 @@ static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI,
          DL.fitsInLegalInteger(IT->getBitWidth());
 }
 
+Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; }
+
+bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; }
+
 static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) {
   // 40% is the default density for building a jump table in optsize/minsize
   // mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this
@@ -6760,25 +6777,23 @@ static bool isSwitchDense(ArrayRef<int64_t> Values) {
 // TODO: We could support larger than legal types by limiting based on the
 // number of loads required and/or table size. If the constants are small we
 // could use smaller table entries and extend after the load.
-static bool
-shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
-                       const TargetTransformInfo &TTI, const DataLayout &DL,
-                       const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
+static bool shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+                                   const TargetTransformInfo &TTI,
+                                   const DataLayout &DL,
+                                   const SmallVector<Type *> &ResultTypes) {
   if (SI->getNumCases() > TableSize)
     return false; // TableSize overflowed.
 
   bool AllTablesFitInRegister = true;
   bool HasIllegalType = false;
-  for (const auto &I : ResultTypes) {
-    Type *Ty = I.second;
-
+  for (const auto &Ty : ResultTypes) {
     // Saturate this flag to true.
     HasIllegalType = HasIllegalType || !isTypeLegalForLookupTable(Ty, TTI, DL);
 
     // Saturate this flag to false.
     AllTablesFitInRegister =
         AllTablesFitInRegister &&
-        SwitchLookupTable::wouldFitInRegister(DL, TableSize, Ty);
+        SwitchReplacement::wouldFitInRegister(DL, TableSize, Ty);
 
     // If both flags saturate, we're done. NOTE: This *only* works with
     // saturating flags, and all flags have to saturate first due to the
@@ -6800,7 +6815,7 @@ shouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
 
 static bool shouldUseSwitchConditionAsTableIndex(
     ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal,
-    bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes,
+    bool HasDefaultResults, const SmallVector<Type *> &ResultTypes,
     const DataLayout &DL, const TargetTransformInfo &TTI) {
   if (MinCaseVal.isNullValue())
     return true;
@@ -6808,10 +6823,9 @@ static bool shouldUseSwitchConditionAsTableIndex(
       MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() ||
       !HasDefaultResults)
     return false;
-  return all_of(ResultTypes, [&](const auto &KV) {
-    return SwitchLookupTable::wouldFitInRegister(
-        DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */,
-        KV.second /* ResultType */);
+  return all_of(ResultTypes, [&](const auto &ResultType) {
+    return SwitchReplacement::wouldFitInRegister(
+        DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, ResultType);
   });
 }
 
@@ -6900,18 +6914,13 @@ static void reuseTableCompare(
 /// If the switch is only used to initialize one or more phi nodes in a common
 /// successor block with different constant values, replace the switch with
 /// lookup tables.
-static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
-                                DomTreeUpdater *DTU, const DataLayout &DL,
-                                const TargetTransformInfo &TTI) {
+static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
+                                 DomTreeUpdater *DTU, const DataLayout &DL,
+                                 const TargetTransformInfo &TTI) {
   assert(SI->getNumCases() > 1 && "Degenerate switch?");
 
   BasicBlock *BB = SI->getParent();
   Function *Fn = BB->getParent();
-  // Only build lookup table when we have a target that supports it or the
-  // attribute is not set.
-  if (!TTI.shouldBuildLookupTables() ||
-      (Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
-    return false;
 
   // FIXME: If the switch is too sparse for a lookup table, perhaps we could
   // split off a dense part and build a lookup table for that.
@@ -6938,7 +6947,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   SmallDenseMap<PHINode *, ResultListTy> ResultLists;
 
   SmallDenseMap<PHINode *, Constant *> DefaultResults;
-  SmallDenseMap<PHINode *, Type *> ResultTypes;
+  SmallVector<Type *> ResultTypes;
   SmallVector<PHINode *, 4> PHIs;
 
   for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
@@ -6955,7 +6964,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
                         Results, DL, TTI))
       return false;
 
-    // Append the result from this case to the list for each phi.
+    // Append the result and result types from this case to the list for each
+    // phi.
     for (const auto &I : Results) {
       PHINode *PHI = I.first;
       Constant *Value = I.second;
@@ -6963,23 +6973,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
       if (Inserted)
         PHIs.push_back(PHI);
       It->second.push_back(std::make_pair(CaseVal, Value));
+      ResultTypes.push_back(PHI->getType());
     }
   }
 
-  // Keep track of the result types.
-  for (PHINode *PHI : PHIs) {
-    ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
-  }
-
-  uint64_t NumResults = ResultLists[PHIs[0]].size();
-
   // If the table has holes, we need a constant result for the default case
   // or a bitmask that fits in a register.
   SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
   bool HasDefaultResults =
       getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
                      DefaultResultsList, DL, TTI);
-
   for (const auto &I : DefaultResultsList) {
     PHINode *PHI = I.first;
     Constant *Result = I.second;
@@ -6989,15 +6992,21 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   bool UseSwitchConditionAsTableIndex = shouldUseSwitchConditionAsTableIndex(
       *MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI);
   uint64_t TableSize;
-  if (UseSwitchConditionAsTableIndex)
+  ConstantInt *TableIndexOffset;
+  if (UseSwitchConditionAsTableIndex) {
     TableSize = MaxCaseVal->getLimitedValue() + 1;
-  else
+    TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0);
+  } else {
     TableSize =
         (MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1;
 
+    TableIndexOffset = MinCaseVal;
+  }
+
   // If the default destination is unreachable, or if the lookup table covers
   // all values of the conditional variable, branch directly to the lookup table
   // BB. Otherwise, check that the condition is within the case range.
+  uint64_t NumResults = ResultLists[PHIs[0]].size();
   bool DefaultIsReachable = !SI->defaultDestUnreachable();
 
   bool TableHasHoles = (NumResults < TableSize);
@@ -7025,68 +7034,100 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   if (!shouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
     return false;
 
-  std::vector<DominatorTree::UpdateType> Updates;
-
-  // Compute the maximum table size representable by the integer type we are
-  // switching upon.
-  unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
-  uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
-  assert(MaxTableSize >= TableSize &&
-         "It is impossible for a switch to have more entries than the max "
-         "representable value of its input integer type's size.");
-
-  // Create the BB that does the lookups.
-  Module &Mod = *CommonDest->getParent()->getParent();
-  BasicBlock *LookupBB = BasicBlock::Create(
-      Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
-
   // Compute the table index value.
-  Builder.SetInsertPoint(SI);
   Value *TableIndex;
-  ConstantInt *TableIndexOffset;
   if (UseSwitchConditionAsTableIndex) {
-    TableIndexOffset = ConstantInt::get(MaxCaseVal->getIntegerType(), 0);
     TableIndex = SI->getCondition();
-  } else {
-    TableIndexOffset = MinCaseVal;
+    if (HasDefaultResults) {
+      // Grow the table to cover all possible index values to avoid the range
+      // check. It will use the default result to fill in the table hole later,
+      // so make sure it exists.
+      ConstantRange CR =
+          computeConstantRange(TableIndex, /* ForSigned */ false);
+      // Growing the table shouldn't have any size impact; this is ensured by
+      // checking wouldFitInRegister.
+      // TODO: Consider growing the table also when it doesn't fit in a register
+      // if no optsize is specified.
+      const uint64_t UpperBound = CR.getUpper().getLimitedValue();
+      if (!CR.isUpperWrapped() &&
+          all_of(ResultTypes, [&](const auto &ResultType) {
+            return SwitchReplacement::wouldFitInRegister(DL, UpperBound,
+                                                         ResultType);
+          })) {
+        // There may be some case index larger than the UpperBound (unreachable
+        // case), so make sure the table size does not get smaller.
+        TableSize = std::max(UpperBound, TableSize);
+        // The default branch is unreachable after we enlarge the lookup table.
+        // Adjust DefaultIsReachable to reuse code path.
+        DefaultIsReachable = false;
+      }
+    }
+  }
+
+  // Keep track of the switch replacement for each phi.
+  SmallDenseMap<PHINode *, SwitchReplacement> PhiToReplacementMap;
+  for (PHINode *PHI : PHIs) {
+    const auto &ResultList = ResultLists[PHI];
+
+    Type *ResultType = ResultList.begin()->second->getType();
+    // Use any value to fill the lookup table holes.
+    Constant *DefaultVal =
+        AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
+    StringRef FuncName = Fn->getName();
+    SwitchReplacement Replacement(*Fn->getParent(), TableSize, TableIndexOffset,
+                                  ResultList, DefaultVal, DL, FuncName);
+    PhiToReplacementMap.insert({PHI, Replacement});
+  }
+
+  bool AnyLookupTables = any_of(
+      PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); });
+
+  // A few conditions prevent the generation of lookup tables:
+  //     1. The target does not support lookup tables.
+  //     2. The "no-jump-tables" function attribute is set.
+  // However, these objections do not apply to other switch replacements, like
+  // the bitmap, so we only stop here if any of these conditions are met and we
+  // want to create a LUT. Otherwise, continue with the switch replacement.
+  if (AnyLookupTables &&
+      (!TTI.shouldBuildLookupTables() ||
+       Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
+    return false;
+
+  Builder.SetInsertPoint(SI);
+  // TableIndex is the switch condition minus TableIndexOffset if we don't
+  // use the condition directly.
+  if (!UseSwitchConditionAsTableIndex) {
     // If the default is unreachable, all case values are s>= MinCaseVal. Then
     // we can try to attach nsw.
     bool MayWrap = true;
     if (!DefaultIsReachable) {
-      APInt Res = MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap);
+      APInt Res =
+          MaxCaseVal->getValue().ssub_ov(MinCaseVal->getValue(), MayWrap);
       (void)Res;
     }
-
     TableIndex = Builder.CreateSub(SI->getCondition(), TableIndexOffset,
                                    "switch.tableidx", /*HasNUW =*/false,
                                    /*HasNSW =*/!MayWrap);
   }
 
-  BranchInst *RangeCheckBranch = nullptr;
+  std::vector<DominatorTree::UpdateType> Updates;
 
-  // Grow the table to cover all possible index values to avoid the range check.
-  // It will use the default result to fill in the table hole later, so make
-  // sure it exist.
-  if (UseSwitchConditionAsTableIndex && HasDefaultResults) {
-    ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false);
-    // Grow the table shouldn't have any size impact by checking
-    // wouldFitInRegister.
-    // TODO: Consider growing the table also when it doesn't fit in a register
-    // if no optsize is specified.
-    const uint64_t UpperBound = CR.getUpper().getLimitedValue();
-    if (!CR.isUpperWrapped() && all_of(ResultTypes, [&](const auto &KV) {
-          return SwitchLookupTable::wouldFitInRegister(
-              DL, UpperBound, KV.second /* ResultType */);
-        })) {
-      // There may be some case index larger than the UpperBound (unreachable
-      // case), so make sure the table size does not get smaller.
-      TableSize = std::max(UpperBound, TableSize);
-      // The default branch is unreachable after we enlarge the lookup table.
-      // Adjust DefaultIsReachable to reuse code path.
-      DefaultIsReachable = false;
-    }
-  }
+  // Compute the maximum table size representable by the integer type we are
+  // switching upon.
+  unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+  uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
+  assert(MaxTableSize >= TableSize &&
+         "It is impossible for a switch to have more entries than the max "
+         "representable value of its input integer type's size.");
+
+  // Create the BB that does the lookups.
+  Module &Mod = *CommonDest->getParent()->getParent();
+  BasicBlock *LookupBB = BasicBlock::Create(
+      Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
+
+  BranchInst *RangeCheckBranch = nullptr;
 
+  Builder.SetInsertPoint(SI);
   const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
   if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
     Builder.CreateBr(LookupBB);
@@ -7157,25 +7198,16 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
 
   for (PHINode *PHI : PHIs) {
     const ResultListTy &ResultList = ResultLists[PHI];
-
-    Type *ResultType = ResultList.begin()->second->getType();
-
-    // Use any value to fill the lookup table holes.
-    Constant *DV =
-        AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
-    StringRef FuncName = Fn->getName();
-    SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV,
-                            DL, FuncName);
-
-    Value *Result = Table.buildLookup(TableIndex, Builder, DL);
-
+    auto Replacement = PhiToReplacementMap.at(PHI);
+    auto *Result = Replacement.replaceSwitch(TableIndex, Builder, DL, Fn);
     // Do a small peephole optimization: re-use the switch table compare if
     // possible.
     if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
       BasicBlock *PhiBlock = PHI->getParent();
       // Search for compare instructions which use the phi.
       for (auto *User : PHI->users()) {
-        reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+        reuseTableCompare(User, PhiBlock, RangeCheckBranch,
+                          Replacement.getDefaultValue(), ResultList);
       }
     }
 
@@ -7202,7 +7234,6 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   if (DTU)
     DTU->applyUpdates(Updates);
 
-  ++NumLookupTables;
   if (NeedMask)
     ++NumLookupTablesHoles;
   return true;
@@ -7708,7 +7739,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   // CVP. Therefore, only apply this transformation during late stages of the
   // optimisation pipeline.
   if (Options.ConvertSwitchToLookupTable &&
-      switchToLookupTable(SI, Builder, DTU, DL, TTI))
+      simplifySwitchLookup(SI, Builder, DTU, DL, TTI))
     return requestResimplify();
 
   if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 2d6a748f4507..8acebbaa5458 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -97,6 +97,10 @@ static cl::opt<unsigned, false, HotColdHintParser>
 static cl::opt<unsigned, false, HotColdHintParser> HotNewHintValue(
     "hot-new-hint-value", cl::Hidden, cl::init(254),
     cl::desc("Value to pass to hot/cold operator new for hot allocation"));
+static cl::opt<unsigned, false, HotColdHintParser> AmbiguousNewHintValue(
+    "ambiguous-new-hint-value", cl::Hidden, cl::init(222),
+    cl::desc(
+        "Value to pass to hot/cold operator new for ambiguous allocation"));
 
 //===----------------------------------------------------------------------===//
 // Helper Functions
@@ -1719,6 +1723,37 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
   return nullptr;
 }
 
+// Allow existing calls to operator new() that take a __hot_cold_t parameter to
+// be updated with a compiler-determined hot cold hint value. This is used in
+// cases where the call is marked nobuiltin (because operator new was called
+// explicitly) and therefore cannot be replaced with a different callee.
+Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI,
+                                                     IRBuilderBase &B) {
+  if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew)
+    return nullptr;
+  Function *Callee = CI->getCalledFunction();
+  if (!Callee)
+    return nullptr;
+  LibFunc Func;
+  if (!TLI->getLibFunc(*Callee, Func))
+    return nullptr;
+  switch (Func) {
+  case LibFunc_Znwm12__hot_cold_t:
+  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_Znam12__hot_cold_t:
+  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
+  case LibFunc_size_returning_new_hot_cold:
+  case LibFunc_size_returning_new_aligned_hot_cold:
+    return optimizeNew(CI, B, Func);
+  default:
+    return nullptr;
+  }
+}
+
 // When enabled, replace operator new() calls marked with a hot or cold memprof
 // attribute with an operator new() call that takes a __hot_cold_t parameter.
 // Currently this is supported by the open source version of tcmalloc, see:
@@ -1736,6 +1771,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
     HotCold = NotColdNewHintValue;
   else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() == "hot")
     HotCold = HotNewHintValue;
+  else if (CI->getAttributes().getFnAttr("memprof").getValueAsString() ==
+           "ambiguous")
+    HotCold = AmbiguousNewHintValue;
   else
     return nullptr;
 
@@ -1753,9 +1791,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
                             LibFunc_Znwm12__hot_cold_t, HotCold);
     break;
   case LibFunc_Znwm:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
-                            LibFunc_Znwm12__hot_cold_t, HotCold);
+    return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                          LibFunc_Znwm12__hot_cold_t, HotCold);
     break;
   case LibFunc_Znam12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1763,9 +1800,8 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
                             LibFunc_Znam12__hot_cold_t, HotCold);
     break;
   case LibFunc_Znam:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNew(CI->getArgOperand(0), B, TLI,
-                            LibFunc_Znam12__hot_cold_t, HotCold);
+    return emitHotColdNew(CI->getArgOperand(0), B, TLI,
+                          LibFunc_Znam12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1774,10 +1810,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnwmRKSt9nothrow_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewNoThrow(
-          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
-          LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t, HotCold);
+    return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
+                                 TLI, LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t,
+                                 HotCold);
     break;
   case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1786,10 +1821,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnamRKSt9nothrow_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewNoThrow(
-          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
-          LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t, HotCold);
+    return emitHotColdNewNoThrow(CI->getArgOperand(0), CI->getArgOperand(1), B,
+                                 TLI, LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t,
+                                 HotCold);
     break;
   case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1798,10 +1832,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnwmSt11align_val_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewAligned(
-          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
-          LibFunc_ZnwmSt11align_val_t12__hot_cold_t, HotCold);
+    return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
+                                 TLI, LibFunc_ZnwmSt11align_val_t12__hot_cold_t,
+                                 HotCold);
     break;
   case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1810,10 +1843,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnamSt11align_val_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewAligned(
-          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
-          LibFunc_ZnamSt11align_val_t12__hot_cold_t, HotCold);
+    return emitHotColdNewAligned(CI->getArgOperand(0), CI->getArgOperand(1), B,
+                                 TLI, LibFunc_ZnamSt11align_val_t12__hot_cold_t,
+                                 HotCold);
     break;
   case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1823,11 +1855,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           HotCold);
     break;
   case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewAlignedNoThrow(
-          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
-          TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
-          HotCold);
+    return emitHotColdNewAlignedNoThrow(
+        CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+        TLI, LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
     if (OptimizeExistingHotColdNew)
@@ -1837,17 +1867,14 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
           HotCold);
     break;
   case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdNewAlignedNoThrow(
-          CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
-          TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t,
-          HotCold);
+    return emitHotColdNewAlignedNoThrow(
+        CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B,
+        TLI, LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t, HotCold);
     break;
   case LibFunc_size_returning_new:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI,
-                                         LibFunc_size_returning_new_hot_cold,
-                                         HotCold);
+    return emitHotColdSizeReturningNew(CI->getArgOperand(0), B, TLI,
+                                       LibFunc_size_returning_new_hot_cold,
+                                       HotCold);
     break;
   case LibFunc_size_returning_new_hot_cold:
     if (OptimizeExistingHotColdNew)
@@ -1856,10 +1883,9 @@ Value *LibCallSimplifier::optimizeNew(CallInst *CI, IRBuilderBase &B,
                                          HotCold);
     break;
   case LibFunc_size_returning_new_aligned:
-    if (HotCold != NotColdNewHintValue)
-      return emitHotColdSizeReturningNewAligned(
-          CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
-          LibFunc_size_returning_new_aligned_hot_cold, HotCold);
+    return emitHotColdSizeReturningNewAligned(
+        CI->getArgOperand(0), CI->getArgOperand(1), B, TLI,
+        LibFunc_size_returning_new_aligned_hot_cold, HotCold);
     break;
   case LibFunc_size_returning_new_aligned_hot_cold:
     if (OptimizeExistingHotColdNew)
@@ -4094,8 +4120,11 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
   // TODO: Split out the code below that operates on FP calls so that
   //       we can all non-FP calls with the StrictFP attribute to be
   //       optimized.
-  if (CI->isNoBuiltin())
-    return nullptr;
+  if (CI->isNoBuiltin()) {
+    // If this is an existing call to a hot cold operator new, we can update the
+    // hint parameter value, which doesn't change the callee.
+    return optimizeExistingHotColdNew(CI, Builder);
+  }
 
   LibFunc Func;
   Function *Callee = CI->getCalledFunction();
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index d52d52a9b7d3..6319fd524ff0 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -349,13 +349,7 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
 
     KeyValue = Key->getValue(KeyStorage);
     if (KeyValue == "source") {
-      std::string Error;
-
       Source = std::string(Value->getValue(ValueStorage));
-      if (!Regex(Source).isValid(Error)) {
-        YS.printError(Field.getKey(), "invalid regex: " + Error);
-        return false;
-      }
     } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
     } else if (KeyValue == "transform") {
@@ -379,12 +373,22 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
 
   // TODO see if there is a more elegant solution to selecting the rewrite
   // descriptor type
-  if (!Target.empty())
+  if (!Target.empty()) {
     DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
         Source, Target, Naked));
-  else
-    DL->push_back(
-        std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
+    return true;
+  }
+
+  {
+    std::string Error;
+    if (!Regex(Source).isValid(Error)) {
+      YS.printError(Descriptor, "invalid Source regex: " + Error);
+      return false;
+    }
+  }
+
+  DL->push_back(
+      std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
 
   return true;
 }
@@ -418,13 +422,7 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
 
     KeyValue = Key->getValue(KeyStorage);
     if (KeyValue == "source") {
-      std::string Error;
-
       Source = std::string(Value->getValue(ValueStorage));
-      if (!Regex(Source).isValid(Error)) {
-        YS.printError(Field.getKey(), "invalid regex: " + Error);
-        return false;
-      }
     } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
     } else if (KeyValue == "transform") {
@@ -441,13 +439,23 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
     return false;
   }
 
-  if (!Target.empty())
+  if (!Target.empty()) {
     DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
         Source, Target,
         /*Naked*/ false));
-  else
-    DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
-        Source, Transform));
+    return true;
+  }
+
+  {
+    std::string Error;
+    if (!Regex(Source).isValid(Error)) {
+      YS.printError(Descriptor, "invalid Source regex: " + Error);
+      return false;
+    }
+  }
+
+  DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
+      Source, Transform));
 
   return true;
 }
@@ -481,13 +489,7 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
 
     KeyValue = Key->getValue(KeyStorage);
     if (KeyValue == "source") {
-      std::string Error;
-
       Source = std::string(Value->getValue(ValueStorage));
-      if (!Regex(Source).isValid(Error)) {
-        YS.printError(Field.getKey(), "invalid regex: " + Error);
-        return false;
-      }
     } else if (KeyValue == "target") {
       Target = std::string(Value->getValue(ValueStorage));
     } else if (KeyValue == "transform") {
@@ -504,13 +506,23 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
     return false;
   }
 
-  if (!Target.empty())
+  if (!Target.empty()) {
     DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
         Source, Target,
         /*Naked*/ false));
-  else
-    DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
-        Source, Transform));
+    return true;
+  }
+
+  {
+    std::string Error;
+    if (!Regex(Source).isValid(Error)) {
+      YS.printError(Descriptor, "invalid Source regex: " + Error);
+      return false;
+    }
+  }
+
+  DL->push_back(
+      std::make_unique<PatternRewriteNamedAliasDescriptor>(Source, Transform));
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae..53129e2e5fbb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -170,10 +170,10 @@ private:
   bool recognizeFindFirstByte();
 
   Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
-                             unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
-                             BasicBlock *ExitFail, Value *SearchStart,
-                             Value *SearchEnd, Value *NeedleStart,
-                             Value *NeedleEnd);
+                             unsigned VF, Type *CharTy, Value *IndPhi,
+                             BasicBlock *ExitSucc, BasicBlock *ExitFail,
+                             Value *SearchStart, Value *SearchEnd,
+                             Value *NeedleStart, Value *NeedleEnd);
 
   void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
                               BasicBlock *ExitSucc, BasicBlock *ExitFail,
@@ -242,6 +242,37 @@ bool LoopIdiomVectorize::run(Loop *L) {
   return false;
 }
 
+static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes,
+                             BasicBlock *SuccBB, BasicBlock *IncBB) {
+  for (PHINode &PN : SuccBB->phis()) {
+    // Look through the incoming values to find ScalarRes, meaning this is a
+    // PHI collecting the results of the transformation.
+    bool ResPhi = false;
+    for (Value *Op : PN.incoming_values())
+      if (Op == ScalarRes) {
+        ResPhi = true;
+        break;
+      }
+
+    // Any PHI that depended upon the result of the transformation needs a new
+    // incoming value (VectorRes) from IncBB.
+    if (ResPhi)
+      PN.addIncoming(VectorRes, IncBB);
+    else {
+      // There should be no other outside uses of other values in the
+      // original loop. Any incoming values should either:
+      //   1. Be for blocks outside the loop, which aren't interesting. Or ..
+      //   2. Be from blocks in the loop, with values defined outside
+      //      the loop. We should add a similar incoming value from IncBB.
+      for (BasicBlock *BB : PN.blocks())
+        if (L->contains(BB)) {
+          PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB);
+          break;
+        }
+    }
+  }
+}
+
 bool LoopIdiomVectorize::recognizeByteCompare() {
   // Currently the transformation only works on scalable vector types, although
   // there is no fundamental reason why it cannot be made to work for fixed
@@ -574,13 +605,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
       Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
       {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
 
-  StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
-  auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
-  Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
-  Value *VectorMatchCmp = Builder.CreateIntrinsic(
-      Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
-      {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
-      "mismatch.cmp");
+  Value *VectorMatchCmp =
+      Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp");
   Value *CTZ = Builder.CreateIntrinsic(
       Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
       {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
@@ -940,42 +966,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
     DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
   }
 
-  auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
-    for (PHINode &PN : SuccBB->phis()) {
-      // At this point we've already replaced all uses of the result from the
-      // loop with ByteCmp. Look through the incoming values to find ByteCmp,
-      // meaning this is a Phi collecting the results of the byte compare.
-      bool ResPhi = false;
-      for (Value *Op : PN.incoming_values())
-        if (Op == ByteCmpRes) {
-          ResPhi = true;
-          break;
-        }
-
-      // Any PHI that depended upon the result of the byte compare needs a new
-      // incoming value from CmpBB. This is because the original loop will get
-      // deleted.
-      if (ResPhi)
-        PN.addIncoming(ByteCmpRes, CmpBB);
-      else {
-        // There should be no other outside uses of other values in the
-        // original loop. Any incoming values should either:
-        //   1. Be for blocks outside the loop, which aren't interesting. Or ..
-        //   2. These are from blocks in the loop with values defined outside
-        //      the loop. We should a similar incoming value from CmpBB.
-        for (BasicBlock *BB : PN.blocks())
-          if (CurLoop->contains(BB)) {
-            PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
-            break;
-          }
-      }
-    }
-  };
-
   // Ensure all Phis in the successors of CmpBB have an incoming value from it.
-  fixSuccessorPhis(EndBB);
+  fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB);
   if (EndBB != FoundBB)
-    fixSuccessorPhis(FoundBB);
+    fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB);
 
   // The new CmpBB block isn't part of the loop, but will need to be added to
   // the outer loop if there is one.
@@ -1173,8 +1167,9 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() {
 
 Value *LoopIdiomVectorize::expandFindFirstByte(
     IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
-    BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
-    Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
+    Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail,
+    Value *SearchStart, Value *SearchEnd, Value *NeedleStart,
+    Value *NeedleEnd) {
   // Set up some types and constants that we intend to reuse.
   auto *PtrTy = Builder.getPtrTy();
   auto *I64Ty = Builder.getInt64Ty();
@@ -1374,6 +1369,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
   MatchLCSSA->addIncoming(Search, BB2);
   MatchPredLCSSA->addIncoming(MatchPred, BB2);
 
+  // Ensure all Phis in the successors of BB3/BB5 have an incoming value from
+  // them.
+  fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB3);
+  if (ExitSucc != ExitFail)
+    fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5);
+
   if (VerifyLoops) {
     OuterLoop->verifyLoop();
     InnerLoop->verifyLoop();
@@ -1395,21 +1396,12 @@ void LoopIdiomVectorize::transformFindFirstByte(
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
   Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
 
-  Value *MatchVal =
-      expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
-                          SearchStart, SearchEnd, NeedleStart, NeedleEnd);
+  expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail,
+                      SearchStart, SearchEnd, NeedleStart, NeedleEnd);
 
   assert(PHBranch->isUnconditional() &&
          "Expected preheader to terminate with an unconditional branch.");
 
-  // Add new incoming values with the result of the transformation to PHINodes
-  // of ExitSucc that use IndPhi.
-  for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
-    auto *PN = dyn_cast<PHINode>(U);
-    if (PN && PN->getParent() == ExitSucc)
-      PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
-  }
-
   if (VerifyLoops && CurLoop->getParentLoop()) {
     CurLoop->getParentLoop()->verifyLoop();
     if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 789047a2a28e..2704e66f3a70 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,8 +15,10 @@
 //
 
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1223,8 +1225,18 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     });
   }
 
-  if (!LAI->canVectorizeMemory())
+  if (!LAI->canVectorizeMemory()) {
+    if (hasUncountableExitWithSideEffects()) {
+      reportVectorizationFailure(
+          "Cannot vectorize unsafe dependencies in uncountable exit loop with "
+          "side effects",
+          "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
+          TheLoop);
+      return false;
+    }
+
     return canVectorizeIndirectUnsafeDependences();
+  }
 
   if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
     reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1530,7 +1542,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
             if (!isGuaranteedNotToBePoison(CurrV, AC,
                                            TheLoop->getLoopPredecessor()
                                                ->getTerminator()
-                                               ->getIterator()))
+                                               ->getIterator(),
+                                           DT))
               return false;
             continue;
           }
@@ -1754,16 +1767,24 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     }
   };
 
+  bool HasSideEffects = false;
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
       if (I.mayWriteToMemory()) {
-        // We don't support writes to memory.
+        if (isa<StoreInst>(&I) && cast<StoreInst>(&I)->isSimple()) {
+          HasSideEffects = true;
+          continue;
+        }
+
+        // We don't support complex writes to memory.
         reportVectorizationFailure(
-            "Writes to memory unsupported in early exit loops",
-            "Cannot vectorize early exit loop with writes to memory",
+            "Complex writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with complex writes to memory",
             "WritesInEarlyExitLoop", ORE, TheLoop);
         return false;
-      } else if (!IsSafeOperation(&I)) {
+      }
+
+      if (!IsSafeOperation(&I)) {
         reportVectorizationFailure("Early exit loop contains operations that "
                                    "cannot be speculatively executed",
                                    "UnsafeOperationsEarlyExitLoop", ORE,
@@ -1776,15 +1797,37 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
          "Expected latch predecessor to be the early exiting block");
 
+  SmallVector<LoadInst *, 4> NonDerefLoads;
   // TODO: Handle loops that may fault.
-  Predicates.clear();
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
-                                     &Predicates)) {
-    reportVectorizationFailure(
-        "Loop may fault",
-        "Cannot vectorize potentially faulting early exit loop",
-        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+  if (!HasSideEffects) {
+    // Read-only loop.
+    Predicates.clear();
+    if (!isReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, NonDerefLoads,
+                        &Predicates)) {
+      reportVectorizationFailure(
+          "Loop may fault", "Cannot vectorize non-read-only early exit loop",
+          "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
+      return false;
+    }
+  } else if (!canUncountableExitConditionLoadBeMoved(
+                 SingleUncountableExitingBlock))
     return false;
+
+  // Check non-dereferenceable loads if any.
+  for (LoadInst *LI : NonDerefLoads) {
+    // Only support unit-stride access for now.
+    int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
+    if (Stride != 1) {
+      reportVectorizationFailure(
+          "Loop contains potentially faulting strided load",
+          "Cannot vectorize early exit loop with "
+          "strided fault-only-first load",
+          "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
+      return false;
+    }
+    PotentiallyFaultingLoads.insert(LI);
+    LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
+                      << "\n");
   }
 
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1797,6 +1840,99 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
                        "backedge taken count: "
                     << *SymbolicMaxBTC << '\n');
   UncountableExitingBB = SingleUncountableExitingBlock;
+  UncountableExitWithSideEffects = HasSideEffects;
+  return true;
+}
+
+bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
+    BasicBlock *ExitingBlock) {
+  // Try to find a load in the critical path for the uncountable exit condition.
+  // This is currently matching about the simplest form we can, expecting
+  // only one in-loop load, the result of which is directly compared against
+  // a loop-invariant value.
+  // FIXME: We're insisting on a single use for now, because otherwise we will
+  // need to make PHI nodes for other users. That can be done once the initial
+  // transform code lands.
+  auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
+
+  using namespace llvm::PatternMatch;
+  Instruction *L = nullptr;
+  Value *Ptr = nullptr;
+  Value *R = nullptr;
+  if (!match(Br->getCondition(),
+             m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
+                             m_Value(R))))) {
+    reportVectorizationFailure(
+        "Early exit loop with store but no supported condition load",
+        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  // FIXME: Don't rely on operand ordering for the comparison.
+  if (!TheLoop->isLoopInvariant(R)) {
+    reportVectorizationFailure(
+        "Early exit loop with store but no supported condition load",
+        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  // Make sure that the load address is not loop invariant; we want an
+  // address calculation that we can rotate to the next vector iteration.
+  const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
+  if (!isa<SCEVAddRecExpr>(PtrScev)) {
+    reportVectorizationFailure(
+        "Uncountable exit condition depends on load with an address that is "
+        "not an add recurrence",
+        "EarlyExitLoadInvariantAddress", ORE, TheLoop);
+    return false;
+  }
+
+  // FIXME: Support gathers after first-faulting load support lands.
+  SmallVector<const SCEVPredicate *, 4> Predicates;
+  LoadInst *Load = cast<LoadInst>(L);
+  if (!isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), *DT, AC,
+                                         &Predicates)) {
+    reportVectorizationFailure(
+        "Loop may fault",
+        "Cannot vectorize potentially faulting early exit loop",
+        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  ICFLoopSafetyInfo SafetyInfo;
+  SafetyInfo.computeLoopSafetyInfo(TheLoop);
+  // We need to know that the load will be executed before we can hoist a
+  // copy out to run just before the first iteration.
+  // FIXME: Currently, other restrictions prevent us from reaching this point
+  //        with a loop where the uncountable exit condition is determined
+  //        by a conditional load.
+  assert(SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop) &&
+         "Unhandled control flow in uncountable exit loop with side effects");
+
+  // Prohibit any potential aliasing with any instruction in the loop which
+  // might store to memory.
+  // FIXME: Relax this constraint where possible.
+  for (auto *BB : TheLoop->blocks()) {
+    for (auto &I : *BB) {
+      if (&I == Load)
+        continue;
+
+      if (I.mayWriteToMemory()) {
+        if (auto *SI = dyn_cast<StoreInst>(&I)) {
+          AliasResult AR = AA->alias(Ptr, SI->getPointerOperand());
+          if (AR == AliasResult::NoAlias)
+            continue;
+        }
+
+        reportVectorizationFailure(
+            "Cannot determine whether critical uncountable exit load address "
+            "does not alias with a memory write",
+            "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
+        return false;
+      }
+    }
+  }
+
   return true;
 }
 
@@ -1869,6 +2005,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     } else {
       if (!isVectorizableEarlyExitLoop()) {
         assert(!hasUncountableEarlyExit() &&
+               !hasUncountableExitWithSideEffects() &&
                "Must be false without vectorizable early-exit loop");
         if (DoExtraAnalysis)
           Result = false;
@@ -1887,6 +2024,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
+  // Bail out for state-changing loops with uncountable exits for now.
+  if (UncountableExitWithSideEffects) {
+    reportVectorizationFailure(
+        "Writes to memory unsupported in early exit loops",
+        "Cannot vectorize early exit loop with writes to memory",
+        "WritesInEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
   if (Result) {
     LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                       << (LAI->getRuntimePointerChecking()->Need
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 838476dcae66..d34d2ae7a0b3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -334,6 +334,10 @@ public:
         FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
   }
 
+  VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) {
+    return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
+  }
+
   //===--------------------------------------------------------------------===//
   // RAII helpers.
   //===--------------------------------------------------------------------===//
@@ -559,6 +563,20 @@ public:
   /// Emit remarks for recipes with invalid costs in the available VPlans.
   void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
 
+  /// Add a check to \p Plan to see if the vector loop should be executed
+  /// based on its trip count.
+  void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+                                ElementCount MinProfitableTripCount) const;
+
+  /// Update loop metadata and profile info for both the scalar remainder loop
+  /// and \p VectorLoop, if it exists. Keeps all loop hints from the original
+  /// loop on the vector loop and replaces vectorizer-specific metadata.
+  void updateLoopMetadataAndProfileInfo(Loop *VectorLoop,
+                                        VPBasicBlock *HeaderVPBB,
+                                        bool VectorizingEpilogue,
+                                        unsigned EstimatedVFxUF,
+                                        bool DisableRuntimeUnroll);
+
 protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
@@ -613,13 +631,15 @@ private:
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
   /// that of B.
   bool isMoreProfitable(const VectorizationFactor &A,
-                        const VectorizationFactor &B, bool HasTail) const;
+                        const VectorizationFactor &B, bool HasTail,
+                        bool IsEpilogue = false) const;
 
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
   /// that of B in the context of vectorizing a loop with known \p MaxTripCount.
   bool isMoreProfitable(const VectorizationFactor &A,
                         const VectorizationFactor &B,
-                        const unsigned MaxTripCount, bool HasTail) const;
+                        const unsigned MaxTripCount, bool HasTail,
+                        bool IsEpilogue = false) const;
 
   /// Determines if we have the infrastructure to vectorize the loop and its
   /// epilogue, assuming the main loop is vectorized by \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754..3cff43a51029 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -165,15 +165,6 @@ using namespace SCEVPatternMatch;
 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
 #endif
 
-/// @{
-/// Metadata attribute names
-const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
-const char LLVMLoopVectorizeFollowupVectorized[] =
-    "llvm.loop.vectorize.followup_vectorized";
-const char LLVMLoopVectorizeFollowupEpilogue[] =
-    "llvm.loop.vectorize.followup_epilogue";
-/// @}
-
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
@@ -500,26 +491,22 @@ public:
   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                       LoopInfo *LI, DominatorTree *DT,
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
-                      ElementCount VecWidth,
-                      ElementCount MinProfitableTripCount,
-                      unsigned UnrollFactor, LoopVectorizationCostModel *CM,
-                      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
-                      GeneratedRTChecks &RTChecks, VPlan &Plan)
+                      ElementCount VecWidth, unsigned UnrollFactor,
+                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+                      VPlan &Plan)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
-        VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
-        UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
-        BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+        Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
         VectorPHVPBB(cast<VPBasicBlock>(
             Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
 
   virtual ~InnerLoopVectorizer() = default;
 
-  /// Create a new empty loop that will contain vectorized instructions later
-  /// on, while the old loop will be used as the scalar remainder. Control flow
-  /// is generated around the vectorized (and scalar epilogue) loops consisting
-  /// of various checks and bypasses. Return the pre-header block of the new
-  /// loop. In the case of epilogue vectorization, this function is overriden to
-  /// handle the more complex control flow around the loops.
+  /// Creates a basic block for the scalar preheader. Both
+  /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop override
+  /// the method to create additional blocks and checks needed for epilogue
+  /// vectorization.
   virtual BasicBlock *createVectorizedLoopSkeleton();
 
   /// Fix the vectorized code, taking care of header phi's, and more.
@@ -536,38 +523,18 @@ public:
   /// count of the original loop for both main loop and epilogue vectorization.
   void setTripCount(Value *TC) { TripCount = TC; }
 
-  /// Return the additional bypass block which targets the scalar loop by
-  /// skipping the epilogue loop after completing the main loop.
-  BasicBlock *getAdditionalBypassBlock() const {
-    assert(AdditionalBypassBlock &&
-           "Trying to access AdditionalBypassBlock but it has not been set");
-    return AdditionalBypassBlock;
-  }
-
 protected:
   friend class LoopVectorizationPlanner;
 
-  // Create a check to see if the vector loop should be executed
-  Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
-  /// Emit a bypass check to see if the vector trip count is zero, including if
-  /// it overflows.
-  void emitIterationCountCheck(BasicBlock *Bypass);
-
-  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
-  /// vector loop preheader, middle block and scalar preheader.
-  void createVectorLoopSkeleton(StringRef Prefix);
+  /// Create and return a new IR basic block for the scalar preheader whose name
+  /// is prefixed with \p Prefix.
+  BasicBlock *createScalarPreheader(StringRef Prefix);
 
   /// Allow subclasses to override and print debug traces before/after vplan
   /// execution, when trace information is requested.
   virtual void printDebugTracesAtStart() {}
   virtual void printDebugTracesAtEnd() {}
 
-  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
-  /// vector preheader and its predecessor, also connecting the new block to the
-  /// scalar preheader.
-  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
-
   /// The original loop.
   Loop *OrigLoop;
 
@@ -592,8 +559,6 @@ protected:
   /// vector elements.
   ElementCount VF;
 
-  ElementCount MinProfitableTripCount;
-
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -603,18 +568,9 @@ protected:
 
   // --- Vectorization state ---
 
-  /// The vector-loop preheader.
-  BasicBlock *LoopVectorPreHeader = nullptr;
-
-  /// The scalar-loop preheader.
-  BasicBlock *LoopScalarPreHeader = nullptr;
-
   /// Trip count of the original loop.
   Value *TripCount = nullptr;
 
-  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
-  Value *VectorTripCount = nullptr;
-
   /// The profitablity analysis.
   LoopVectorizationCostModel *Cost;
 
@@ -626,11 +582,6 @@ protected:
   /// for cleaning the checks, if vectorization turns out unprofitable.
   GeneratedRTChecks &RTChecks;
 
-  /// The additional bypass block which conditionally skips over the epilogue
-  /// loop after executing the main loop. Needed to resume inductions and
-  /// reductions during epilogue vectorization.
-  BasicBlock *AdditionalBypassBlock = nullptr;
-
   VPlan &Plan;
 
   /// The vector preheader block of \p Plan, used as target for check blocks
@@ -679,20 +630,8 @@ public:
       GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
       ElementCount MinProfitableTripCount, unsigned UnrollFactor)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
-                            MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
-                            Checks, Plan),
-        EPI(EPI) {}
-
-  // Override this function to handle the more complex control flow around the
-  // three loops.
-  BasicBlock *createVectorizedLoopSkeleton() final {
-    return createEpilogueVectorizedLoopSkeleton();
-  }
-
-  /// The interface for creating a vectorized skeleton using one of two
-  /// different strategies, each corresponding to one execution of the vplan
-  /// as described above.
-  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+                            UnrollFactor, CM, BFI, PSI, Checks, Plan),
+        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
 
   /// Holds and updates state information required to vectorize the main loop
   /// and its epilogue in two separate passes. This setup helps us avoid
@@ -701,6 +640,9 @@ public:
   /// iteration count of the loop is so small that the main vector loop is
   /// completely skipped.
   EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+  ElementCount MinProfitableTripCount;
 };
 
 /// A specialized derived class of inner loop vectorizer that performs
@@ -720,14 +662,24 @@ public:
                                        BFI, PSI, Check, Plan, EPI.MainLoopVF,
                                        EPI.MainLoopVF, EPI.MainLoopUF) {}
   /// Implements the interface for creating a vectorized skeleton using the
-  /// *main loop* strategy (ie the first pass of vplan execution).
-  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+  /// *main loop* strategy (i.e., the first pass of VPlan execution).
+  BasicBlock *createVectorizedLoopSkeleton() final;
 
 protected:
+  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+  /// vector preheader and its predecessor, also connecting the new block to the
+  /// scalar preheader.
+  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
+  /// Create a check to see if the main vector loop should be executed.
+  Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
+                                   unsigned UF) const;
+
   /// Emits an iteration count bypass check once for the main loop (when \p
   /// ForEpilogue is false) and once for the epilogue loop (when \p
   /// ForEpilogue is true).
-  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
+  BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
+                                      bool ForEpilogue);
   void printDebugTracesAtStart() override;
   void printDebugTracesAtEnd() override;
 };
@@ -736,6 +688,11 @@ protected:
 // vectorization of *epilogue* loops in the process of vectorizing loops and
 // their epilogues.
 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
+  /// The additional bypass block which conditionally skips over the epilogue
+  /// loop after executing the main loop. Needed to resume inductions and
+  /// reductions during epilogue vectorization.
+  BasicBlock *AdditionalBypassBlock = nullptr;
+
 public:
   EpilogueVectorizerEpilogueLoop(
       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -749,14 +706,22 @@ public:
     TripCount = EPI.TripCount;
   }
   /// Implements the interface for creating a vectorized skeleton using the
-  /// *epilogue loop* strategy (ie the second pass of vplan execution).
-  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
+  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
+  BasicBlock *createVectorizedLoopSkeleton() final;
+
+  /// Return the additional bypass block which targets the scalar loop by
+  /// skipping the epilogue loop after completing the main loop.
+  BasicBlock *getAdditionalBypassBlock() const {
+    assert(AdditionalBypassBlock &&
+           "Trying to access AdditionalBypassBlock but it has not been set");
+    return AdditionalBypassBlock;
+  }
 
 protected:
   /// Emits an iteration count bypass check after the main vector loop has
   /// finished to see if there are any iterations left to execute by either
   /// the vector epilogue or the scalar epilogue.
-  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
+  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH,
                                                       BasicBlock *Bypass,
                                                       BasicBlock *Insert);
   void printDebugTracesAtStart() override;
@@ -962,8 +927,8 @@ public:
   /// user options, for the given register kind.
   bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
 
-  /// \return True if register pressure should be calculated for the given VF.
-  bool shouldCalculateRegPressureForVF(ElementCount VF);
+  /// \return True if register pressure should be considered for the given VF.
+  bool shouldConsiderRegPressureForVF(ElementCount VF);
 
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1159,7 +1124,10 @@ public:
   CallWideningDecision getCallWideningDecision(CallInst *CI,
                                                ElementCount VF) const {
     assert(!VF.isScalar() && "Expected vector VF");
-    return CallWideningDecisions.at({CI, VF});
+    auto I = CallWideningDecisions.find({CI, VF});
+    if (I == CallWideningDecisions.end())
+      return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+    return I->second;
   }
 
   /// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1682,7 +1650,9 @@ private:
     Instruction *I = dyn_cast<Instruction>(V);
     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
         TheLoop->isLoopInvariant(I) ||
-        getWideningDecision(I, VF) == CM_Scalarize)
+        getWideningDecision(I, VF) == CM_Scalarize ||
+        (isa<CallInst>(I) &&
+         getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
       return false;
 
     // Assume we can vectorize V (and hence we need extraction) if the
@@ -1878,6 +1848,8 @@ public:
              "claimed checks are required");
     }
 
+    SCEVExp.eraseDeadInstructions(SCEVCheckCond);
+
     if (!MemCheckBlock && !SCEVCheckBlock)
       return;
 
@@ -2030,7 +2002,7 @@ public:
 
   /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
   /// outside VPlan.
-  std::pair<Value *, BasicBlock *> getSCEVChecks() {
+  std::pair<Value *, BasicBlock *> getSCEVChecks() const {
     using namespace llvm::PatternMatch;
     if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
       return {nullptr, nullptr};
@@ -2040,7 +2012,7 @@ public:
 
   /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
   /// outside VPlan.
-  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
     using namespace llvm::PatternMatch;
     if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
       return {nullptr, nullptr};
@@ -2049,9 +2021,7 @@ public:
 
   /// Return true if any runtime checks have been added
   bool hasChecks() const {
-    using namespace llvm::PatternMatch;
-    return (SCEVCheckCond && !match(SCEVCheckCond, m_ZeroInt())) ||
-           MemRuntimeCheckCond;
+    return getSCEVChecks().first || getMemRuntimeChecks().first;
   }
 };
 } // namespace
@@ -2276,7 +2246,8 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
   return TTI.enableMaskedInterleavedAccessVectorization();
 }
 
-void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
+void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
+    BasicBlock *CheckIRBB) {
   // Note: The block with the minimum trip-count check is already connected
   // during earlier VPlan construction.
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
@@ -2300,8 +2271,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   }
 }
 
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
-                                                      unsigned UF) const {
+Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
+    BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
   // vector trip count is zero. This check also covers the case where adding one
@@ -2312,7 +2283,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
 
   // Reuse existing vector loop preheader for TC checks.
   // Note that new preheader block is generated for vector loop.
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  BasicBlock *const TCCheckBlock = VectorPH;
   IRBuilder<InstSimplifyFolder> Builder(
       TCCheckBlock->getContext(),
       InstSimplifyFolder(TCCheckBlock->getDataLayout()));
@@ -2371,25 +2342,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
   return CheckMinIters;
 }
 
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
-  Value *CheckMinIters = createIterationCountCheck(VF, UF);
-  // Create new preheader for vector loop.
-  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
-                                   static_cast<DominatorTree *>(nullptr), LI,
-                                   nullptr, "vector.ph");
-
-  BranchInst &BI =
-      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
-  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
-    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
-  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
-  assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
-             TCCheckBlock &&
-         "Plan's entry must be TCCCheckBlock");
-}
-
 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2410,20 +2362,19 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
   return IRVPBB;
 }
 
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
-  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
-  assert(LoopVectorPreHeader && "Invalid loop structure");
+BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
+  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
+  assert(VectorPH && "Invalid loop structure");
   assert((OrigLoop->getUniqueLatchExitBlock() ||
           Cost->requiresScalarEpilogue(VF.isVector())) &&
          "loops not exiting via the latch without required epilogue?");
 
-  LoopScalarPreHeader =
-      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
-                 LI, nullptr, Twine(Prefix) + "scalar.ph");
   // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
-  // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
-  // preheader may be unreachable at this point. Instead it is replaced in
-  // createVectorizedLoopSkeleton.
+  // wrapping the newly created scalar preheader here at the moment, because the
+  // Plan's scalar preheader may be unreachable at this point. Instead it is
+  // replaced in executePlan.
+  return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
+                    Twine(Prefix) + "scalar.ph");
 }
 
 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2464,54 +2415,9 @@ static void addFullyUnrolledInstructionsToIgnore(
 }
 
 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
-  /*
-   In this function we generate a new loop. The new loop will contain
-   the vectorized instructions while the old loop will continue to run the
-   scalar remainder.
-
-       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
-     /  |      preheader are expanded here. Eventually all required SCEV
-    /   |      expansion should happen here.
-   /    v
-  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
-  |  /  |
-  | /   v
-  ||   [ ]     <-- vector pre header.
-  |/    |
-  |     v
-  |    [  ] \
-  |    [  ]_|   <-- vector loop (created during VPlan execution).
-  |     |
-  |     v
-  \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
-   |    |                       successors created during VPlan execution)
-   \/   |
-   /\   v
-   | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
-   |    |
- (opt)  v      <-- edge from middle to exit iff epilogue is not required.
-   |   [ ] \
-   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
-   |    |          wrapped in VPIRBasicBlock).
-    \   |
-     \  v
-      >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
-   ...
-   */
-
-  // Create an empty vector loop, and prepare basic blocks for the runtime
-  // checks.
-  createVectorLoopSkeleton("");
-
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop. This check also covers the case where the
-  // backedge-taken count is uint##_max: adding one to it will overflow leading
-  // to an incorrect trip count of zero. In this (rare) case we will also jump
-  // to the scalar loop.
-  emitIterationCountCheck(LoopScalarPreHeader);
-
-  replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
-  return LoopVectorPreHeader;
+  // Create a new IR basic block for the scalar preheader.
+  BasicBlock *ScalarPH = createScalarPreheader("");
+  return ScalarPH->getSinglePredecessor();
 }
 
 namespace {
@@ -2652,24 +2558,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
 
   // Remove redundant induction instructions.
   cse(HeaderBB);
-
-  // Set/update profile weights for the vector and remainder loops as original
-  // loop iterations are now distributed among them. Note that original loop
-  // becomes the scalar remainder loop after vectorization.
-  //
-  // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
-  // end up getting slightly roughened result but that should be OK since
-  // profile is not inherently precise anyway. Note also possible bypass of
-  // vector code caused by legality checks is ignored, assigning all the weight
-  // to the vector loop, optimistically.
-  //
-  // For scalable vectorization we can't know at compile time how many
-  // iterations of the loop are handled in one vector iteration, so instead
-  // use the value of vscale used for tuning.
-  Loop *VectorLoop = LI->getLoopFor(HeaderBB);
-  unsigned EstimatedVFxUF =
-      estimateElementCount(VF * UF, Cost->getVScaleForTuning());
-  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
 }
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3020,19 +2908,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
 
-  // Certain instructions can be cheaper to vectorize if they have a constant
-  // second vector operand. One example of this are shifts on x86.
-  Value *Op2 = I->getOperand(1);
-  auto Op2Info = TTI.getOperandInfo(Op2);
-  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
-      Legal->isInvariant(Op2))
-    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-
   SmallVector<const Value *, 4> Operands(I->operand_values());
   SafeDivisorCost += TTI.getArithmeticInstrCost(
-    I->getOpcode(), VecTy, CostKind,
-    {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-    Op2Info, Operands, I);
+      I->getOpcode(), VecTy, CostKind,
+      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+      Operands, I);
   return {ScalarizationCost, SafeDivisorCost};
 }
 
@@ -3810,7 +3691,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   return FixedScalableVFPair::getNone();
 }
 
-bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
+bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
     ElementCount VF) {
   if (!useMaxBandwidth(VF.isScalable()
                            ? TargetTransformInfo::RGK_ScalableVector
@@ -3939,7 +3820,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 const unsigned MaxTripCount,
-                                                bool HasTail) const {
+                                                bool HasTail,
+                                                bool IsEpilogue) const {
   InstructionCost CostA = A.Cost;
   InstructionCost CostB = B.Cost;
 
@@ -3963,7 +3845,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
                         A.Width.isScalable() && !B.Width.isScalable();
 
   auto CmpFn = [PreferScalable](const InstructionCost &LHS,
@@ -4001,10 +3883,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
 
 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
-                                                bool HasTail) const {
+                                                bool HasTail,
+                                                bool IsEpilogue) const {
   const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
-  return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
-                                                    HasTail);
+  return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
+                                                    IsEpilogue);
 }
 
 void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4171,6 +4054,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPWidenIntOrFpInductionSC:
       case VPDef::VPWidenPointerInductionSC:
       case VPDef::VPReductionPHISC:
+      case VPDef::VPInterleaveEVLSC:
       case VPDef::VPInterleaveSC:
       case VPDef::VPWidenLoadEVLSC:
       case VPDef::VPWidenLoadSC:
@@ -4199,8 +4083,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
 
       // If no def nor is a store, e.g., branches, continue - no value to check.
       if (R.getNumDefinedValues() == 0 &&
-          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
-              &R))
+          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(&R))
         continue;
       // For multi-def recipes, currently only interleaved loads, suffice to
       // check first def only.
@@ -4255,8 +4138,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
                                P->vectorFactors().end());
 
     SmallVector<VPRegisterUsage, 8> RUs;
-    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
-        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+    if (any_of(VFs, [this](ElementCount VF) {
+          return CM.shouldConsiderRegPressureForVF(VF);
+        }))
       RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
 
     for (unsigned I = 0; I < VFs.size(); I++) {
@@ -4268,7 +4152,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       /// If the register pressure needs to be considered for VF,
       /// don't consider the VF as valid if it exceeds the number
       /// of registers for the target.
-      if (CM.shouldCalculateRegPressureForVF(VF) &&
+      if (CM.shouldConsiderRegPressureForVF(VF) &&
           RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
         continue;
 
@@ -4286,7 +4170,33 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
           if (!VPI)
             continue;
           switch (VPI->getOpcode()) {
-          case VPInstruction::ActiveLaneMask:
+          // Selects are only modelled in the legacy cost model for safe
+          // divisors.
+          case Instruction::Select: {
+            VPValue *VPV = VPI->getVPSingleValue();
+            if (VPV->getNumUsers() == 1) {
+              if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
+                switch (WR->getOpcode()) {
+                case Instruction::UDiv:
+                case Instruction::SDiv:
+                case Instruction::URem:
+                case Instruction::SRem:
+                  continue;
+                default:
+                  break;
+                }
+              }
+            }
+            C += VPI->cost(VF, CostCtx);
+            break;
+          }
+          case VPInstruction::ActiveLaneMask: {
+            unsigned Multiplier =
+                cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+                    ->getZExtValue();
+            C += VPI->cost(VF * Multiplier, CostCtx);
+            break;
+          }
           case VPInstruction::ExplicitVectorLength:
             C += VPI->cost(VF, CostCtx);
             break;
@@ -4511,7 +4421,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     }
 
     if (Result.Width.isScalar() ||
-        isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
+        isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
+                         /*IsEpilogue=*/true))
       Result = NextVF;
   }
 
@@ -5326,8 +5237,11 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
-  Type *PtrTy = toVectorTy(Ptr->getType(), VF);
+  Value *Ptr = getLoadStorePointerOperand(I);
+  Type *PtrTy = Ptr->getType();
+
+  if (!Legal->isUniform(Ptr, VF))
+    PtrTy = toVectorTy(PtrTy, VF);
 
   return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
@@ -5483,7 +5397,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                              TTI::CastContextHint::None, CostKind, RedOp);
 
     InstructionCost RedCost = TTI.getMulAccReductionCost(
-        IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+        IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+        CostKind);
 
     if (RedCost.isValid() &&
         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5528,7 +5443,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
 
       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
       InstructionCost ExtraExtCost = 0;
       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5547,7 +5463,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
 
       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+          true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
+          CostKind);
 
       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
         return I == RetI ? RedCost : 0;
@@ -6262,10 +6179,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
               Op1->getType()->getScalarSizeInBits() == 1);
 
-      SmallVector<const Value *, 2> Operands{Op0, Op1};
       return TTI.getArithmeticInstrCost(
-          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
-          CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
+          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
+          VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
     }
 
     Type *CondTy = SI->getCondition()->getType();
@@ -6495,7 +6411,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
         }))
       continue;
     VecValuesToIgnore.insert(Op);
-    DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
+    append_range(DeadInterleavePointerOps, Op->operands());
   }
 
   for (const auto &[_, Ops] : DeadInvariantStoreOps)
@@ -6555,7 +6471,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
       ValuesToIgnore.insert(Op);
 
     VecValuesToIgnore.insert(Op);
-    DeadOps.append(Op->op_begin(), Op->op_end());
+    append_range(DeadOps, Op->operands());
   }
 
   // Ignore type-promoting instructions we identified during reduction
@@ -6765,9 +6681,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
 
 InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
                                              ElementCount VF) const {
-  if (ForceTargetInstructionCost.getNumOccurrences())
-    return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
-  return CM.getInstructionCost(UI, VF);
+  InstructionCost Cost = CM.getInstructionCost(UI, VF);
+  if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
+    return InstructionCost(ForceTargetInstructionCost);
+  return Cost;
 }
 
 bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
@@ -7071,8 +6988,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
                                P->vectorFactors().end());
 
     SmallVector<VPRegisterUsage, 8> RUs;
-    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
-        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+    if (any_of(VFs, [this](ElementCount VF) {
+          return CM.shouldConsiderRegPressureForVF(VF);
+        }))
       RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
 
     for (unsigned I = 0; I < VFs.size(); I++) {
@@ -7098,7 +7016,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
       InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
 
-      if (CM.shouldCalculateRegPressureForVF(VF) &&
+      if (CM.shouldConsiderRegPressureForVF(VF) &&
           RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
         LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                           << VF << " because it uses too many registers\n");
@@ -7146,40 +7064,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   return BestFactor;
 }
 
-static void addRuntimeUnrollDisableMetaData(Loop *L) {
-  SmallVector<Metadata *, 4> MDs;
-  // Reserve first location for self reference to the LoopID metadata node.
-  MDs.push_back(nullptr);
-  bool IsUnrollMetadata = false;
-  MDNode *LoopID = L->getLoopID();
-  if (LoopID) {
-    // First find existing loop unrolling disable metadata.
-    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
-      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
-      if (MD) {
-        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
-        IsUnrollMetadata =
-            S && S->getString().starts_with("llvm.loop.unroll.disable");
-      }
-      MDs.push_back(LoopID->getOperand(I));
-    }
-  }
-
-  if (!IsUnrollMetadata) {
-    // Add runtime unroll disable metadata.
-    LLVMContext &Context = L->getHeader()->getContext();
-    SmallVector<Metadata *, 1> DisableOperands;
-    DisableOperands.push_back(
-        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
-    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
-    MDs.push_back(DisableNode);
-    MDNode *NewLoopID = MDNode::get(Context, MDs);
-    // Set operand 0 to refer to the loop id itself.
-    NewLoopID->replaceOperandWith(0, NewLoopID);
-    L->setLoopID(NewLoopID);
-  }
-}
-
 static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
   using namespace VPlanPatternMatch;
   assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
@@ -7193,7 +7077,7 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
 // epilog loop, fix the reduction's scalar PHI node by adding the incoming value
 // from the main vector loop.
 static void fixReductionScalarResumeWhenVectorizingEpilog(
-    VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
+    VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
   // Get the VPInstruction computing the reduction result in the middle block.
   // The first operand may not be from the middle block if it is not connected
   // to the scalar preheader. In that case, there's nothing to fix.
@@ -7248,8 +7132,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
   // When fixing reductions in the epilogue loop we should already have
   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
   // over the incoming values correctly.
-  auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
-  EpiResumePhi->setIncomingValueForBlock(
+  EpiResumePhi.setIncomingValueForBlock(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }
 
@@ -7276,11 +7159,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                              BestVPlan, BestVF, VScale);
   }
 
-  if (!VectorizingEpilogue) {
-    // Checks are the same for all VPlans, added to BestVPlan only for
-    // compactness.
-    attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
-  }
+  // Checks are the same for all VPlans, added to BestVPlan only for
+  // compactness.
+  attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
 
   // Retrieving VectorPH now when it's easier while VPlan still has Regions.
   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7291,6 +7172,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+  VPlanTransforms::cse(BestVPlan);
   VPlanTransforms::removeDeadRecipes(BestVPlan);
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -7327,8 +7209,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  BasicBlock *EntryBB =
-      cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
   replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
                         State.CFG.PrevBB->getSingleSuccessor());
@@ -7342,7 +7222,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // looked through single-entry phis.
   ScalarEvolution &SE = *PSE.getSE();
   for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
-    if (Exit->getNumPredecessors() == 0)
+    if (!Exit->hasPredecessors())
       continue;
     for (VPRecipeBase &PhiR : Exit->phis())
       SE.forgetLcssaPhiWithNewPredecessor(
@@ -7362,88 +7242,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   //
   //===------------------------------------------------===//
 
-  // Move check blocks to their final position.
-  // TODO: Move as part of VPIRBB execute and update impacted tests.
-  if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
-    MemCheckBlock->moveAfter(EntryBB);
-  if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
-    SCEVCheckBlock->moveAfter(EntryBB);
-
   BestVPlan.execute(&State);
 
-  // 2.5 When vectorizing the epilogue, fix reduction resume values from the
-  // additional bypass block.
-  if (VectorizingEpilogue) {
-    assert(!BestVPlan.hasEarlyExit() &&
-           "Epilogue vectorisation not yet supported with early exits");
-    BasicBlock *PH = OrigLoop->getLoopPreheader();
-    BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
-    for (auto *Pred : predecessors(PH)) {
-      for (PHINode &Phi : PH->phis()) {
-        if (Phi.getBasicBlockIndex(Pred) != -1)
-          continue;
-        Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
-      }
-    }
-    VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
-    if (ScalarPH->getNumPredecessors() > 0) {
-      // If ScalarPH has predecessors, we may need to update its reduction
-      // resume values.
-      for (VPRecipeBase &R : ScalarPH->phis()) {
-        fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
-                                                      BypassBlock);
-      }
-    }
-  }
-
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
   VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
-  if (HeaderVPBB) {
-    MDNode *OrigLoopID = OrigLoop->getLoopID();
-
-    std::optional<MDNode *> VectorizedLoopID =
-        makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                        LLVMLoopVectorizeFollowupVectorized});
-
-    Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
-    if (VectorizedLoopID) {
-      L->setLoopID(*VectorizedLoopID);
-    } else {
-      // Keep all loop hints from the original loop on the vector loop (we'll
-      // replace the vectorizer-specific hints below).
-      if (MDNode *LID = OrigLoop->getLoopID())
-        L->setLoopID(LID);
-
-      LoopVectorizeHints Hints(L, true, *ORE);
-      Hints.setAlreadyVectorized();
-
-      // Check if it's EVL-vectorized and mark the corresponding metadata.
-      bool IsEVLVectorized =
-          llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
-            // Looking for the ExplictVectorLength VPInstruction.
-            if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
-              return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
-            return false;
-          });
-      if (IsEVLVectorized) {
-        LLVMContext &Context = L->getHeader()->getContext();
-        MDNode *LoopID = L->getLoopID();
-        auto *IsEVLVectorizedMD = MDNode::get(
-            Context,
-            {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
-             MDString::get(Context, "evl")});
-        MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
-                                                           {IsEVLVectorizedMD});
-        L->setLoopID(NewLoopID);
-      }
-    }
-    TargetTransformInfo::UnrollingPreferences UP;
-    TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
-    if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
-      addRuntimeUnrollDisableMetaData(L);
-  }
+  // Add metadata to disable runtime unrolling of the scalar loop when there
+  // are no runtime checks about strides and memory. A scalar loop that is
+  // rarely used is not worth unrolling.
+  bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
+  updateLoopMetadataAndProfileInfo(
+      HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
+                 : nullptr,
+      HeaderVPBB, VectorizingEpilogue,
+      estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
+      DisableRuntimeUnroll);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   //    predication, updating analyses.
@@ -7460,15 +7274,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
 /// This function is partially responsible for generating the control flow
 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
-  createVectorLoopSkeleton("");
+BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
+  BasicBlock *ScalarPH = createScalarPreheader("");
+  BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
 
   // Generate the code to check the minimum iteration count of the vector
   // epilogue (see below).
   EPI.EpilogueIterationCountCheck =
-      emitIterationCountCheck(LoopScalarPreHeader, true);
+      emitIterationCountCheck(VectorPH, ScalarPH, true);
   EPI.EpilogueIterationCountCheck->setName("iter.check");
 
+  VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
+                 ->getSuccessor(1);
   // Generate the iteration count check for the main loop, *after* the check
   // for the epilogue loop, so that the path-length is shorter for the case
   // that goes directly through the vector epilogue. The longer-path length for
@@ -7476,9 +7293,10 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
   // trip count. Note: the branch will get updated later on when we vectorize
   // the epilogue.
   EPI.MainLoopIterationCountCheck =
-      emitIterationCountCheck(LoopScalarPreHeader, false);
+      emitIterationCountCheck(VectorPH, ScalarPH, false);
 
-  return LoopVectorPreHeader;
+  return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
+      ->getSuccessor(1);
 }
 
 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7498,35 +7316,33 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
   });
 }
 
-BasicBlock *
-EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
-                                                    bool ForEpilogue) {
+BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
+    BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
   assert(Bypass && "Expected valid bypass basic block.");
   Value *Count = getTripCount();
   MinProfitableTripCount = ElementCount::getFixed(0);
-  Value *CheckMinIters =
-      createIterationCountCheck(ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
-                                ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
+  Value *CheckMinIters = createIterationCountCheck(
+      VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
+      ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
 
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  BasicBlock *const TCCheckBlock = VectorPH;
   if (!ForEpilogue)
     TCCheckBlock->setName("vector.main.loop.iter.check");
 
   // Create new preheader for vector loop.
-  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
-                                   static_cast<DominatorTree *>(nullptr), LI,
-                                   nullptr, "vector.ph");
+  VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
+                        static_cast<DominatorTree *>(nullptr), LI, nullptr,
+                        "vector.ph");
   if (ForEpilogue) {
     // Save the trip count so we don't have to regenerate it in the
     // vec.epilog.iter.check. This is safe to do because the trip count
     // generated here dominates the vector epilog iter check.
     EPI.TripCount = Count;
   } else {
-    VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+    VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
   }
 
-  BranchInst &BI =
-      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+  BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
@@ -7546,19 +7362,18 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
 
 /// This function is partially responsible for generating the control flow
 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
-  createVectorLoopSkeleton("vec.epilog.");
-
+BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
+  BasicBlock *ScalarPH = createScalarPreheader("vec.epilog.");
+  BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
   // Now, compare the remaining count and if there aren't enough iterations to
   // execute the vectorized epilogue skip to the scalar part.
-  LoopVectorPreHeader->setName("vec.epilog.ph");
+  VectorPH->setName("vec.epilog.ph");
   BasicBlock *VecEpilogueIterationCountCheck =
-      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
-                 nullptr, "vec.epilog.iter.check", true);
-  VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+      SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr,
+                 "vec.epilog.iter.check", true);
+  VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
 
-  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
+  emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH,
                                           VecEpilogueIterationCountCheck);
   AdditionalBypassBlock = VecEpilogueIterationCountCheck;
 
@@ -7567,23 +7382,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
          "expected this to be saved from the previous pass.");
   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
-      VecEpilogueIterationCountCheck, LoopVectorPreHeader);
+      VecEpilogueIterationCountCheck, VectorPH);
 
   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
-      VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+      VecEpilogueIterationCountCheck, ScalarPH);
 
   // Adjust the terminators of runtime check blocks and phis using them.
   BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
   BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
   if (SCEVCheckBlock)
     SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
-        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+        VecEpilogueIterationCountCheck, ScalarPH);
   if (MemCheckBlock)
     MemCheckBlock->getTerminator()->replaceUsesOfWith(
-        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
+        VecEpilogueIterationCountCheck, ScalarPH);
 
-  DT->changeImmediateDominator(LoopScalarPreHeader,
-                               EPI.EpilogueIterationCountCheck);
+  DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck);
 
   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
   // reductions which merge control-flow from the latch block and the middle
@@ -7592,7 +7406,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
       llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
 
   for (PHINode *Phi : PhisInBlock) {
-    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt());
+    Phi->moveBefore(VectorPH->getFirstNonPHIIt());
     Phi->replaceIncomingBlockWith(
         VecEpilogueIterationCountCheck->getSinglePredecessor(),
         VecEpilogueIterationCountCheck);
@@ -7612,12 +7426,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
       Phi->removeIncomingValue(MemCheckBlock);
   }
 
-  return LoopVectorPreHeader;
+  return VectorPH;
 }
 
 BasicBlock *
 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
-    BasicBlock *Bypass, BasicBlock *Insert) {
+    BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) {
 
   assert(EPI.TripCount &&
          "Expected trip count to have been saved in the first pass.");
@@ -7637,23 +7451,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
                                          EPI.EpilogueVF, EPI.EpilogueUF),
                          "min.epilog.iters.check");
 
-  BranchInst &BI =
-      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
-  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    auto VScale = Cost->getVScaleForTuning();
-    unsigned MainLoopStep =
-        estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
-    unsigned EpilogueLoopStep =
-        estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
-    // We assume the remaining `Count` is equally distributed in
-    // [0, MainLoopStep)
-    // So the probability for `Count < EpilogueLoopStep` should be
-    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
-    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
-    const uint32_t Weights[] = {EstimatedSkipCount,
-                                MainLoopStep - EstimatedSkipCount};
-    setBranchWeights(BI, Weights, /*IsExpected=*/false);
-  }
+  BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+  auto VScale = Cost->getVScaleForTuning();
+  unsigned MainLoopStep =
+      estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
+  unsigned EpilogueLoopStep =
+      estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
+  // We assume the remaining `Count` is equally distributed in
+  // [0, MainLoopStep)
+  // So the probability for `Count < EpilogueLoopStep` should be
+  // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
+  // TODO: Improve the estimate by taking the estimated trip count into
+  // consideration.
+  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+  const uint32_t Weights[] = {EstimatedSkipCount,
+                              MainLoopStep - EstimatedSkipCount};
+  setBranchWeights(BI, Weights, /*IsExpected=*/false);
   ReplaceInstWithInst(Insert->getTerminator(), &BI);
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
@@ -8634,8 +8447,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
             return !CM.requiresScalarEpilogue(VF.isVector());
           },
           Range);
-  VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
-                                    Range);
+  VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
   VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
                                   CM.foldTailByMasking());
 
@@ -8761,10 +8573,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
 
       VPRecipeBase *Recipe =
           RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
-      if (!Recipe) {
-        SmallVector<VPValue *, 4> Operands(R.operands());
-        Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
-      }
+      if (!Recipe)
+        Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
 
       RecipeBuilder.setRecipe(Instr, Recipe);
       if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
@@ -8790,7 +8600,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // to remove the need to keep a map of masks beyond the predication
   // transform.
   RecipeBuilder.updateBlockMaskCache(Old2New);
-  for (const auto &[Old, _] : Old2New)
+  for (VPValue *Old : Old2New.keys())
     Old->getDefiningRecipe()->eraseFromParent();
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
@@ -8851,41 +8661,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                            InterleaveGroups, RecipeBuilder,
                            CM.isScalarEpilogueAllowed());
 
-  // Replace VPValues for known constant strides guaranteed by predicate scalar
-  // evolution.
-  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
-    auto *R = cast<VPRecipeBase>(&U);
-    return R->getParent()->getParent() ||
-           R->getParent() ==
-               Plan->getVectorLoopRegion()->getSinglePredecessor();
-  };
-  for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
-    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
-    auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
-    // Only handle constant strides for now.
-    if (!ScevStride)
-      continue;
-
-    auto *CI = Plan->getOrAddLiveIn(
-        ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
-    if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
-      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
-
-    // The versioned value may not be used in the loop directly but through a
-    // sext/zext. Add new live-ins in those cases.
-    for (Value *U : StrideV->users()) {
-      if (!isa<SExtInst, ZExtInst>(U))
-        continue;
-      VPValue *StrideVPV = Plan->getLiveIn(U);
-      if (!StrideVPV)
-        continue;
-      unsigned BW = U->getType()->getScalarSizeInBits();
-      APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
-                                 : ScevStride->getAPInt().zext(BW);
-      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
-      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
-    }
-  }
+  // Replace VPValues for known constant strides.
+  VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
+                           Legal->getLAI()->getSymbolicStrides());
 
   auto BlockNeedsPredication = [this](BasicBlock *BB) {
     return Legal->blockNeedsPredication(BB);
@@ -8926,7 +8704,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
       OrigLoop, *LI, Legal->getWidestInductionType(),
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
   VPlanTransforms::handleEarlyExits(*Plan,
-                                    /*HasUncountableExit*/ false, Range);
+                                    /*HasUncountableExit*/ false);
   VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
                                   /*TailFolded*/ false);
 
@@ -9316,7 +9094,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
 void LoopVectorizationPlanner::attachRuntimeChecks(
     VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
   const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
-  if (SCEVCheckBlock) {
+  if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
     assert((!CM.OptForSize ||
             CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
            "Cannot SCEV check stride or overflow when optimizing for size");
@@ -9324,7 +9102,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
                                       HasBranchWeights);
   }
   const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
-  if (MemCheckBlock) {
+  if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
     // VPlan-native path does not do any analysis for runtime checks
     // currently.
     assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
@@ -9350,6 +9128,29 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
   }
 }
 
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+    VPlan &Plan, ElementCount VF, unsigned UF,
+    ElementCount MinProfitableTripCount) const {
+  // vscale is not necessarily a power-of-2, which means we cannot guarantee
+  // an overflow to zero when updating induction variables and so an
+  // additional overflow check is required before entering the vector loop.
+  bool IsIndvarOverflowCheckNeededForVF =
+      VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+      !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
+      CM.getTailFoldingStyle() !=
+          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  const uint32_t *BranchWeights =
+      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+          ? &MinItersBypassWeights[0]
+          : nullptr;
+  VPlanTransforms::addMinimumIterationCheck(
+      Plan, VF, UF, MinProfitableTripCount,
+      CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+      IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeights,
+      OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+      *PSE.getSE());
+}
+
 void VPDerivedIVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
 
@@ -9465,17 +9266,18 @@ static bool processLoopInVPlanNativePath(
 
   {
     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                            BFI, PSI, Checks, BestPlan);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                       << L->getHeader()->getParent()->getName() << "\"\n");
-    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
+    LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
+                                 VF.MinProfitableTripCount);
+
+    LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
   }
 
   reportVectorization(ORE, L, VF, 1);
 
-  // Mark the loop as already vectorized to avoid vectorizing again.
-  Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
   return true;
 }
@@ -9929,6 +9731,43 @@ static Value *createInductionAdditionalBypassValues(
   return EndValueFromAdditionalBypass;
 }
 
+static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+                                            VPlan &BestEpiPlan,
+                                            LoopVectorizationLegality &LVL,
+                                            const SCEV2ValueTy &ExpandedSCEVs,
+                                            Value *MainVectorTripCount) {
+  // Each phi in L's preheader gets, for every predecessor it has no entry
+  // for yet, the value it already carries for BypassBlock.
+  BasicBlock *LoopPH = L->getLoopPreheader();
+  for (BasicBlock *PredBB : predecessors(LoopPH)) {
+    for (PHINode &ResumePhi : LoopPH->phis()) {
+      if (ResumePhi.getBasicBlockIndex(PredBB) == -1)
+        ResumePhi.addIncoming(ResumePhi.getIncomingValueForBlock(BypassBlock),
+                              PredBB);
+    }
+  }
+
+  // The plan's scalar preheader may hold reduction resume phis; they only
+  // need updating when that block has predecessors.
+  auto *PlanScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
+  if (PlanScalarPH->hasPredecessors())
+    for (const auto &[PhiR, IRPhi] :
+         zip(PlanScalarPH->phis(), PlanScalarPH->getIRBasicBlock()->phis()))
+      fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&PhiR), IRPhi,
+                                                    BypassBlock);
+
+  // Induction resume values are built at BypassBlock's first insertion point.
+  IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
+  for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+    auto *ResumeInc = cast<PHINode>(IVPhi->getIncomingValueForBlock(LoopPH));
+    Value *BypassEndValue = createInductionAdditionalBypassValues(
+        IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
+        LVL.getPrimaryInduction());
+    // TODO: Directly add as extra operand to the VPResumePHI recipe.
+    ResumeInc->setIncomingValueForBlock(BypassBlock, BypassEndValue);
+  }
+}
+
 bool LoopVectorizePass::processLoop(Loop *L) {
   assert((EnableVPlanNativePath || L->isInnermost()) &&
          "VPlan-native path is not enabled. Only process inner loops.");
@@ -9971,7 +9810,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements;
   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
-                                &Requirements, &Hints, DB, AC, BFI, PSI);
+                                &Requirements, &Hints, DB, AC, BFI, PSI, AA);
   if (!LVL.canVectorize(EnableVPlanNativePath)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
     Hints.emitRemarkWithHints();
@@ -9985,6 +9824,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
+  if (!LVL.getPotentiallyFaultingLoads().empty()) {
+    reportVectorizationFailure("Auto-vectorization of loops with potentially "
+                               "faulting load is not supported",
+                               "PotentiallyFaultingLoadsNotSupported", ORE, L);
+    return false;
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10251,128 +10097,80 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
   }
 
-  bool DisableRuntimeUnroll = false;
-  MDNode *OrigLoopID = L->getLoopID();
-  {
+  // Report the vectorization decision.
+  if (VF.Width.isScalar()) {
     using namespace ore;
-    if (!VectorizeLoop) {
-      assert(IC > 1 && "interleave count should not be 1 or 0");
-      // If we decided that it is not legal to vectorize the loop, then
-      // interleave it.
-      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
-      InnerLoopVectorizer Unroller(
-          L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
-          ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
-
-      // TODO: Move to general VPlan pipeline once epilogue loops are also
-      // supported.
-      VPlanTransforms::runPass(
-          VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
-          VF.Width, IC, PSE);
-
-      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+    assert(IC > 1 && "interleave count should not be 1 or 0");
+    ORE->emit([&]() {
+      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+                                L->getHeader())
+             << "interleaved loop (interleaved count: "
+             << NV("InterleaveCount", IC) << ")";
+    });
+  } else {
+    // Non-scalar VF: the loop is vectorized (and possibly interleaved).
+    reportVectorization(ORE, L, VF, IC);
+  }
+  if (ORE->allowExtraAnalysis(LV_NAME))
+    checkMixedPrecision(L, ORE);
 
-      ORE->emit([&]() {
-        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
-                                  L->getHeader())
-               << "interleaved loop (interleaved count: "
-               << NV("InterleaveCount", IC) << ")";
-      });
-    } else {
-      // If we decided that it is *legal* to vectorize the loop, then do it.
-
-      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
-      // Consider vectorizing the epilogue too if it's profitable.
-      VectorizationFactor EpilogueVF =
-          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
-      if (EpilogueVF.Width.isVector()) {
-        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
-
-        // The first pass vectorizes the main loop and creates a scalar epilogue
-        // to be vectorized by executing the plan (potentially with a different
-        // factor) again shortly afterwards.
-        VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
-        BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
-        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
-        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
-                                          BestEpiPlan);
-        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
-                                           BFI, PSI, Checks, *BestMainPlan);
-        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             *BestMainPlan, MainILV, DT, false);
-        ++LoopsVectorized;
-
-        // Second pass vectorizes the epilogue and adjusts the control flow
-        // edges from the first pass.
-        EpilogueVectorizerEpilogueLoop EpilogILV(
-            L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan);
-        EpilogILV.setTripCount(MainILV.getTripCount());
-        preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
-
-        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
-                        DT, true);
-
-        // Fix induction resume values from the additional bypass block.
-        BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
-        IRBuilder<> BypassBuilder(BypassBlock,
-                                  BypassBlock->getFirstInsertionPt());
-        BasicBlock *PH = L->getLoopPreheader();
-        for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
-          auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
-          Value *V = createInductionAdditionalBypassValues(
-              IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
-              LVL.getPrimaryInduction());
-          // TODO: Directly add as extra operand to the VPResumePHI recipe.
-          Inc->setIncomingValueForBlock(BypassBlock, V);
-        }
-        ++LoopsEpilogueVectorized;
+  // If we decided that it is *legal* to interleave or vectorize the loop, then
+  // do it.
 
-        if (!Checks.hasChecks())
-          DisableRuntimeUnroll = true;
-      } else {
-        InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
-                               VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
-                               Checks, BestPlan);
-        // TODO: Move to general VPlan pipeline once epilogue loops are also
-        // supported.
-        VPlanTransforms::runPass(
-            VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
-            VF.Width, IC, PSE);
-
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
-        ++LoopsVectorized;
-
-        // Add metadata to disable runtime unrolling a scalar loop when there
-        // are no runtime checks about strides and memory. A scalar loop that is
-        // rarely used is not worth unrolling.
-        if (!Checks.hasChecks())
-          DisableRuntimeUnroll = true;
-      }
-      // Report the vectorization decision.
-      reportVectorization(ORE, L, VF, IC);
-    }
+  VPlan &BestPlan = LVP.getPlanFor(VF.Width);
+  // Consider vectorizing the epilogue too if it's profitable.
+  VectorizationFactor EpilogueVF =
+      LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+  if (EpilogueVF.Width.isVector()) {
+    std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
+
+    // The first pass vectorizes the main loop and creates a scalar epilogue
+    // to be vectorized by executing the plan (potentially with a different
+    // factor) again shortly afterwards.
+    VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+    BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
+    preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+    EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
+                                      BestEpiPlan);
+    EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
+                                       PSI, Checks, *BestMainPlan);
+    auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+                                         *BestMainPlan, MainILV, DT, false);
+    ++LoopsVectorized;
+
+    // Second pass vectorizes the epilogue and adjusts the control flow
+    // edges from the first pass.
+    EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
+                                             BFI, PSI, Checks, BestEpiPlan);
+    EpilogILV.setTripCount(MainILV.getTripCount());
+    preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
+
+    LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
+                    true);
+
+    fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L,
+                                    BestEpiPlan, LVL, ExpandedSCEVs,
+                                    EPI.VectorTripCount);
+    ++LoopsEpilogueVectorized;
+  } else {
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
+                           Checks, BestPlan);
+    // TODO: Move to general VPlan pipeline once epilogue loops are also
+    // supported.
+    VPlanTransforms::runPass(
+        VPlanTransforms::materializeConstantVectorTripCount, BestPlan, VF.Width,
+        IC, PSE);
+    LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+                                 VF.MinProfitableTripCount);
 
-    if (ORE->allowExtraAnalysis(LV_NAME))
-      checkMixedPrecision(L, ORE);
+    LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+    ++LoopsVectorized;
   }
 
   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
          "DT not preserved correctly");
+  assert(!verifyFunction(*F, &dbgs()));
 
-  std::optional<MDNode *> RemainderLoopID =
-      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                      LLVMLoopVectorizeFollowupEpilogue});
-  if (RemainderLoopID) {
-    L->setLoopID(*RemainderLoopID);
-  } else {
-    if (DisableRuntimeUnroll)
-      addRuntimeUnrollDisableMetaData(L);
-
-    // Mark the loop as already vectorized to avoid vectorizing again.
-    Hints.setAlreadyVectorized();
-  }
-
-  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
   return true;
 }
 
@@ -10449,6 +10247,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
   DB = &AM.getResult<DemandedBitsAnalysis>(F);
   ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   LAIs = &AM.getResult<LoopAccessAnalysis>(F);
+  AA = &AM.getResult<AAManager>(F);
 
   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
   PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 37dc41413966..6a56dbfaa015 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -967,9 +967,7 @@ class BinOpSameOpcodeHelper {
       return false;
     }
     bool equal(unsigned Opcode) {
-      if (Opcode == I->getOpcode())
-        return trySet(MainOpBIT, MainOpBIT);
-      return false;
+      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
     }
     unsigned getOpcode() const {
       MaskType Candidate = Mask & SeenBefore;
@@ -5576,7 +5574,23 @@ private:
       if (auto *SD = dyn_cast<ScheduleData>(Data)) {
         SD->setScheduled(/*Scheduled=*/true);
         LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
-        ProcessBundleMember(SD, {});
+        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
+        SmallVector<ScheduleBundle *> Bundles;
+        Instruction *In = SD->getInst();
+        if (R.isVectorized(In)) {
+          ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
+          for (TreeEntry *TE : Entries) {
+            if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(In) &&
+                In->getNumOperands() != TE->getNumOperands())
+              continue;
+            auto &BundlePtr =
+                PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
+            BundlePtr->setTreeEntry(TE);
+            BundlePtr->add(SD);
+            Bundles.push_back(BundlePtr.get());
+          }
+        }
+        ProcessBundleMember(SD, Bundles);
       } else {
         ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
         Bundle.setScheduled(/*Scheduled=*/true);
@@ -6325,17 +6339,11 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 }
 
 /// Checks if the provided list of pointers \p Pointers represents the strided
-/// pointers for type ElemTy. If they are not, std::nullopt is returned.
-/// Otherwise, if \p Inst is not specified, just initialized optional value is
-/// returned to show that the pointers represent strided pointers. If \p Inst
-/// specified, the runtime stride is materialized before the given \p Inst.
-/// \returns std::nullopt if the pointers are not pointers with the runtime
-/// stride, nullptr or actual stride value, otherwise.
-static std::optional<Value *>
-calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
-                  const DataLayout &DL, ScalarEvolution &SE,
-                  SmallVectorImpl<unsigned> &SortedIndices,
-                  Instruction *Inst = nullptr) {
+/// pointers for type ElemTy. If they are not, nullptr is returned.
+/// Otherwise, SCEV* of the stride value is returned.
+static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
+                                     const DataLayout &DL, ScalarEvolution &SE,
+                                     SmallVectorImpl<unsigned> &SortedIndices) {
   SmallVector<const SCEV *> SCEVs;
   const SCEV *PtrSCEVLowest = nullptr;
   const SCEV *PtrSCEVHighest = nullptr;
@@ -6344,7 +6352,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   for (Value *Ptr : PointerOps) {
     const SCEV *PtrSCEV = SE.getSCEV(Ptr);
     if (!PtrSCEV)
-      return std::nullopt;
+      return nullptr;
     SCEVs.push_back(PtrSCEV);
     if (!PtrSCEVLowest && !PtrSCEVHighest) {
       PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
@@ -6352,14 +6360,14 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     }
     const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
     if (isa<SCEVCouldNotCompute>(Diff))
-      return std::nullopt;
+      return nullptr;
     if (Diff->isNonConstantNegative()) {
       PtrSCEVLowest = PtrSCEV;
       continue;
     }
     const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
     if (isa<SCEVCouldNotCompute>(Diff1))
-      return std::nullopt;
+      return nullptr;
     if (Diff1->isNonConstantNegative()) {
       PtrSCEVHighest = PtrSCEV;
       continue;
@@ -6368,7 +6376,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   // Dist = PtrSCEVHighest - PtrSCEVLowest;
   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
   if (isa<SCEVCouldNotCompute>(Dist))
-    return std::nullopt;
+    return nullptr;
   int Size = DL.getTypeStoreSize(ElemTy);
   auto TryGetStride = [&](const SCEV *Dist,
                           const SCEV *Multiplier) -> const SCEV * {
@@ -6389,10 +6397,10 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
     const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
     Stride = TryGetStride(Dist, Sz);
     if (!Stride)
-      return std::nullopt;
+      return nullptr;
   }
   if (!Stride || isa<SCEVConstant>(Stride))
-    return std::nullopt;
+    return nullptr;
   // Iterate through all pointers and check if all distances are
   // unique multiple of Stride.
   using DistOrdPair = std::pair<int64_t, int>;
@@ -6406,28 +6414,28 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
       const SCEV *Coeff = TryGetStride(Diff, Stride);
       if (!Coeff)
-        return std::nullopt;
+        return nullptr;
       const auto *SC = dyn_cast<SCEVConstant>(Coeff);
       if (!SC || isa<SCEVCouldNotCompute>(SC))
-        return std::nullopt;
+        return nullptr;
       if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                   SE.getMulExpr(Stride, SC)))
                ->isZero())
-        return std::nullopt;
+        return nullptr;
       Dist = SC->getAPInt().getZExtValue();
     }
     // If the strides are not the same or repeated, we can't vectorize.
     if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
-      return std::nullopt;
+      return nullptr;
     auto Res = Offsets.emplace(Dist, Cnt);
     if (!Res.second)
-      return std::nullopt;
+      return nullptr;
     // Consecutive order if the inserted element is the last one.
     IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
     ++Cnt;
   }
   if (Offsets.size() != SCEVs.size())
-    return std::nullopt;
+    return nullptr;
   SortedIndices.clear();
   if (!IsConsecutive) {
     // Fill SortedIndices array only if it is non-consecutive.
@@ -6438,10 +6446,7 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
       ++Cnt;
     }
   }
-  if (!Inst)
-    return nullptr;
-  SCEVExpander Expander(SE, DL, "strided-load-vec");
-  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
+  return Stride;
 }
 
 static std::pair<InstructionCost, InstructionCost>
@@ -8030,11 +8035,11 @@ void BoUpSLP::reorderTopToBottom() {
         // it is an attempt to reorder node with reused scalars but with
         // external uses.
         if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
-          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
               ExternalUserReorderIndices.size();
         } else {
           for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
-            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
+            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
         }
         // No other useful reorder data in this entry.
         if (Order.empty())
@@ -8054,9 +8059,9 @@ void BoUpSLP::reorderTopToBottom() {
           return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
         });
         fixupOrderingIndices(CurrentOrder);
-        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
       } else {
-        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+        ++OrdersUses.try_emplace(Order, 0).first->second;
       }
     }
     if (OrdersUses.empty())
@@ -8480,12 +8485,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
           });
           fixupOrderingIndices(CurrentOrder);
-          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
-              NumOps;
+          OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
         } else {
-          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+          OrdersUses.try_emplace(Order, 0).first->second += NumOps;
         }
-        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+        auto Res = OrdersUses.try_emplace(OrdersType(), 0);
         const auto AllowsReordering = [&](const TreeEntry *TE) {
           if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
               (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -10639,8 +10643,19 @@ class InstructionsCompatibilityAnalysis {
         }
       }
     }
-    if (MainOp)
+    if (MainOp) {
+      // Do not match, if any copyable is a terminator from the same block as
+      // the main operation.
+      if (any_of(VL, [&](Value *V) {
+            auto *I = dyn_cast<Instruction>(V);
+            return I && I->getParent() == MainOp->getParent() &&
+                   I->isTerminator();
+          })) {
+        MainOp = nullptr;
+        return;
+      }
       MainOpcode = MainOp->getOpcode();
+    }
   }
 
   /// Returns the idempotent value for the \p MainOp with the detected \p
@@ -11013,7 +11028,10 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
       }
       SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
       if (all_of(VL, [&](Value *V) {
-            return isa<PoisonValue>(V) || Values.contains(V);
+            return isa<PoisonValue>(V) || Values.contains(V) ||
+                   (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
+                    LI->getLoopFor(S.getMainOp()->getParent()) &&
+                    isVectorized(V));
           })) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
         return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
@@ -17835,6 +17853,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
         IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
   }
 
+  /// Returns \p E's vector value; integer values go through castToScalarTyElem.
+  Value *getVectorizedValue(const TreeEntry &E) {
+    Value *Vec = E.VectorizedValue;
+    if (!Vec->getType()->isIntOrIntVectorTy())
+      return Vec;
+    const SimplifyQuery SQ(*R.DL);
+    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *Scalar) {
+      return !isa<PoisonValue>(Scalar) && !isKnownNonNegative(Scalar, SQ);
+    }));
+  }
+
 public:
   ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
       : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
@@ -18001,35 +18030,14 @@ public:
   /// Adds 2 input vectors (in form of tree entries) and the mask for their
   /// shuffling.
   void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
-    Value *V1 = E1.VectorizedValue;
-    if (V1->getType()->isIntOrIntVectorTy())
-      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
-                                if (isa<PoisonValue>(V))
-                                  return false;
-                                return !isKnownNonNegative(
-                                    V, SimplifyQuery(*R.DL));
-                              }));
-    Value *V2 = E2.VectorizedValue;
-    if (V2->getType()->isIntOrIntVectorTy())
-      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
-                                if (isa<PoisonValue>(V))
-                                  return false;
-                                return !isKnownNonNegative(
-                                    V, SimplifyQuery(*R.DL));
-                              }));
+    Value *V1 = getVectorizedValue(E1);
+    Value *V2 = getVectorizedValue(E2);
     add(V1, V2, Mask);
   }
   /// Adds single input vector (in form of tree entry) and the mask for its
   /// shuffling.
   void add(const TreeEntry &E1, ArrayRef<int> Mask) {
-    Value *V1 = E1.VectorizedValue;
-    if (V1->getType()->isIntOrIntVectorTy())
-      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
-                                if (isa<PoisonValue>(V))
-                                  return false;
-                                return !isKnownNonNegative(
-                                    V, SimplifyQuery(*R.DL));
-                              }));
+    Value *V1 = getVectorizedValue(E1);
     add(V1, Mask);
   }
   /// Adds 2 input vectors and the mask for their shuffling.
@@ -18178,14 +18186,7 @@ public:
       auto CreateSubVectors = [&](Value *Vec,
                                   SmallVectorImpl<int> &CommonMask) {
         for (auto [E, Idx] : SubVectors) {
-          Value *V = E->VectorizedValue;
-          if (V->getType()->isIntOrIntVectorTy())
-            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
-                                     if (isa<PoisonValue>(V))
-                                       return false;
-                                     return !isKnownNonNegative(
-                                         V, SimplifyQuery(*R.DL));
-                                   }));
+          Value *V = getVectorizedValue(*E);
           unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
           // Use scalar version of the SCalarType to correctly handle shuffles
           // for revectorization. The revectorization mode operates by the
@@ -19526,11 +19527,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
             return cast<LoadInst>(V)->getPointerOperand();
           });
           OrdersType Order;
-          std::optional<Value *> Stride =
-              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
-                                &*Builder.GetInsertPoint());
+          const SCEV *StrideSCEV =
+              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order);
+          assert(StrideSCEV && "At this point stride should be known");
+          SCEVExpander Expander(*SE, *DL, "strided-load-vec");
+          Value *Stride = Expander.expandCodeFor(
+              StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint());
           Value *NewStride =
-              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
+              Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
           StrideVal = Builder.CreateMul(
               NewStride,
               ConstantInt::get(
@@ -20519,7 +20523,9 @@ Value *BoUpSLP::vectorizeTree(
           !(GatheredLoadsEntriesFirst.has_value() &&
             IE->Idx >= *GatheredLoadsEntriesFirst &&
             VectorizableTree.front()->isGather() &&
-            is_contained(VectorizableTree.front()->Scalars, I)))
+            is_contained(VectorizableTree.front()->Scalars, I)) &&
+          !(!VectorizableTree.front()->isGather() &&
+            VectorizableTree.front()->isCopyableElement(I)))
         continue;
       SmallVector<SelectInst *> LogicalOpSelects;
       I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
@@ -20782,6 +20788,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           continue;
         }
         auto *SD = cast<ScheduleData>(SE);
+        if (SD->hasValidDependencies() &&
+            (!S.areInstructionsWithCopyableElements() ||
+             !S.isCopyableElement(SD->getInst())) &&
+            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
+            EI.UserTE->hasState() &&
+            (!EI.UserTE->hasCopyableElements() ||
+             !EI.UserTE->isCopyableElement(SD->getInst())))
+          SD->clearDirectDependencies();
         for (const Use &U : SD->getInst()->operands()) {
           unsigned &NumOps =
               UserOpToNumOps
@@ -20791,7 +20805,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           if (auto *Op = dyn_cast<Instruction>(U.get());
               Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                          *SLP, NumOps)) {
-            if (ScheduleData *OpSD = getScheduleData(Op)) {
+            if (ScheduleData *OpSD = getScheduleData(Op);
+                OpSD && OpSD->hasValidDependencies()) {
               OpSD->clearDirectDependencies();
               if (RegionHasStackSave ||
                   !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -20977,7 +20992,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           ScheduleCopyableDataMapByUsers.erase(I);
         ScheduleCopyableDataMap.erase(KV);
         // Need to recalculate dependencies for the actual schedule data.
-        if (ScheduleData *OpSD = getScheduleData(I)) {
+        if (ScheduleData *OpSD = getScheduleData(I);
+            OpSD && OpSD->hasValidDependencies()) {
           OpSD->clearDirectDependencies();
           if (RegionHasStackSave ||
               !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
@@ -21881,6 +21897,10 @@ bool BoUpSLP::collectValuesToDemote(
     return TryProcessInstruction(BitWidth);
   case Instruction::ZExt:
   case Instruction::SExt:
+    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
+        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
+        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
+      return false;
     IsProfitableToDemote = true;
     return TryProcessInstruction(BitWidth);
 
@@ -23797,9 +23817,7 @@ public:
         size_t Key, Idx;
         std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                /*AllowAlternate=*/false);
-        ++PossibleReducedVals[Key][Idx]
-              .insert(std::make_pair(V, 0))
-              .first->second;
+        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
       }
       for (Instruction *I : reverse(PossibleReductionOps))
         Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
@@ -23820,21 +23838,20 @@ public:
       stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
         return P1.size() > P2.size();
       });
-      int NewIdx = -1;
+      bool First = true;
       for (ArrayRef<Value *> Data : PossibleRedValsVect) {
-        if (NewIdx < 0 ||
-            (!isGoodForReduction(Data) &&
-             (!isa<LoadInst>(Data.front()) ||
-              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
-              getUnderlyingObject(
-                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
-                  getUnderlyingObject(
-                      cast<LoadInst>(ReducedVals[NewIdx].front())
-                          ->getPointerOperand())))) {
-          NewIdx = ReducedVals.size();
+        if (First) {
+          First = false;
           ReducedVals.emplace_back();
+        } else if (!isGoodForReduction(Data)) {
+          auto *LI = dyn_cast<LoadInst>(Data.front());
+          auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
+          if (!LI || !LastLI ||
+              getUnderlyingObject(LI->getPointerOperand()) !=
+                  getUnderlyingObject(LastLI->getPointerOperand()))
+            ReducedVals.emplace_back();
         }
-        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
+        ReducedVals.back().append(Data.rbegin(), Data.rend());
       }
     }
     // Sort the reduced values by number of same/alternate opcode and/or pointer
@@ -23847,7 +23864,8 @@ public:
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                     DominatorTree &DT) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -24164,9 +24182,7 @@ public:
         // previous vectorization attempts.
         if (any_of(VL, [&V](Value *RedVal) {
               auto *RedValI = dyn_cast<Instruction>(RedVal);
-              if (!RedValI)
-                return false;
-              return V.isDeleted(RedValI);
+              return RedValI && V.isDeleted(RedValI);
             }))
           break;
         V.buildTree(VL, IgnoreList);
@@ -24248,7 +24264,7 @@ public:
 
         // Estimate cost.
         InstructionCost ReductionCost =
-            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
         InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                           << " for reduction\n");
@@ -24553,7 +24569,9 @@ private:
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
                                    bool IsCmpSelMinMax, FastMathFlags FMF,
-                                   const BoUpSLP &R) {
+                                   const BoUpSLP &R, DominatorTree &DT,
+                                   const DataLayout &DL,
+                                   const TargetLibraryInfo &TLI) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
     unsigned ReduxWidth = ReducedVals.size();
@@ -24578,6 +24596,22 @@ private:
         for (User *U : RdxVal->users()) {
           auto *RdxOp = cast<Instruction>(U);
           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            if (RdxKind == RecurKind::FAdd) {
+              InstructionCost FMACost = canConvertToFMA(
+                  RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+              if (FMACost.isValid()) {
+                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+                if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+                  // Also, exclude scalar fmul cost.
+                  InstructionCost FMulCost =
+                      TTI->getInstructionCost(I, CostKind);
+                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+                  FMACost -= FMulCost;
+                }
+                ScalarCost += FMACost;
+                continue;
+              }
+            }
             ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
             continue;
           }
@@ -24642,8 +24676,45 @@ private:
           auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
               std::make_pair(RedTy, true));
           VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
-          VectorCost +=
-              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+          InstructionCost FMACost = InstructionCost::getInvalid();
+          if (RdxKind == RecurKind::FAdd) {
+            // Check if the reduction operands can be converted to FMA.
+            SmallVector<Value *> Ops;
+            FastMathFlags FMF;
+            FMF.set();
+            for (Value *RdxVal : ReducedVals) {
+              if (!RdxVal->hasOneUse()) {
+                Ops.clear();
+                break;
+              }
+              if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+                FMF &= FPCI->getFastMathFlags();
+              Ops.push_back(RdxVal->user_back());
+            }
+            if (!Ops.empty()) {
+              FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+                                        *TTI, TLI);
+              if (FMACost.isValid()) {
+                // Calculate actual FMAD cost.
+                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+                                            {RVecTy, RVecTy, RVecTy}, FMF);
+                FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+                LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+                // Also, exclude vector fmul cost.
+                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+                    Instruction::FMul, RVecTy, CostKind);
+                LLVM_DEBUG(dbgs()
+                           << "Minus vector FMul cost: " << FMulCost << "\n");
+                FMACost -= FMulCost;
+              }
+            }
+          }
+          if (FMACost.isValid())
+            VectorCost += FMACost;
+          else
+            VectorCost +=
+                TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
           if (RType != RedTy) {
             unsigned Opcode = Instruction::Trunc;
             if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25311,7 +25382,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25456,7 +25527,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
       return false;
 
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
@@ -25540,7 +25611,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
 template <typename T>
 static bool tryToVectorizeSequence(
     SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
-    function_ref<bool(T *, T *)> AreCompatible,
+    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
     function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
     bool MaxVFOnly, BoUpSLP &R) {
   bool Changed = false;
@@ -25562,7 +25633,7 @@ static bool tryToVectorizeSequence(
     auto *SameTypeIt = IncIt;
     while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                                R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
-                               AreCompatible(*SameTypeIt, *IncIt))) {
+                               AreCompatible(VL, *SameTypeIt))) {
       auto *I = dyn_cast<Instruction>(*SameTypeIt);
       ++SameTypeIt;
       if (I && !R.isDeleted(I))
@@ -25760,10 +25831,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
     return compareCmp<false>(V, V2, *TLI, *DT);
   };
 
-  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
-    if (V1 == V2)
+  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
+    if (VL.empty() || VL.back() == V1)
       return true;
-    return compareCmp<true>(V1, V2, *TLI, *DT);
+    return compareCmp<true>(V1, VL.back(), *TLI, *DT);
   };
 
   SmallVector<Value *> Vals;
@@ -25969,9 +26040,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     }
     return false;
   };
-  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
-    if (V1 == V2)
+  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
+                                                     Value *V1) {
+    if (VL.empty() || V1 == VL.back())
       return true;
+    Value *V2 = VL.back();
     if (V1->getType() != V2->getType())
       return false;
     ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
@@ -26061,7 +26134,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
   InstSetVector PostProcessInserts;
   SmallSetVector<CmpInst *, 8> PostProcessCmps;
-  // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
+  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
   // also vectorizes `PostProcessCmps`.
   auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
@@ -26342,7 +26415,13 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
            V2->getValueOperand()->getValueID();
   };
 
-  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
+  bool SameParent = true;
+  auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
+    if (VL.empty()) {
+      SameParent = true;
+      return true;
+    }
+    StoreInst *V2 = VL.back();
     if (V1 == V2)
       return true;
     if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
@@ -26353,15 +26432,34 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     if (isa<UndefValue>(V1->getValueOperand()) ||
         isa<UndefValue>(V2->getValueOperand()))
       return true;
-    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
-      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
-        if (I1->getParent() != I2->getParent())
-          return false;
-        return getSameOpcode({I1, I2}, *TLI).valid();
-      }
     if (isa<Constant>(V1->getValueOperand()) &&
         isa<Constant>(V2->getValueOperand()))
       return true;
+    // Check if the operands of the stores can be vectorized. They can be
+    // vectorized, if they have compatible operands or have operands, which can
+    // be vectorized as copyables.
+    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
+    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
+    if (I1 || I2) {
+      // Accept only tail-following non-compatible values for now.
+      // TODO: investigate if it is possible to vectorize incompatible values,
+      // if the copyables are first in the list.
+      if (I1 && !I2)
+        return false;
+      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
+      SmallVector<Value *> NewVL(VL.size() + 1);
+      for (auto [SI, V] : zip(VL, NewVL))
+        V = SI->getValueOperand();
+      NewVL.back() = V1->getValueOperand();
+      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+      InstructionsState S = Analysis.buildInstructionsState(
+          NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
+          /*SkipSameCodeCheck=*/!SameParent);
+      if (S)
+        return true;
+      if (!SameParent)
+        return false;
+    }
     return V1->getValueOperand()->getValueID() ==
            V2->getValueOperand()->getValueID();
   };
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f972efa07eb7..16b1b539345d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <cassert>
 #include <string>
 
@@ -55,6 +56,15 @@ namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
 
+/// @{
+/// Metadata attribute names
+const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
+const char LLVMLoopVectorizeFollowupVectorized[] =
+    "llvm.loop.vectorize.followup_vectorized";
+const char LLVMLoopVectorizeFollowupEpilogue[] =
+    "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 static cl::opt<bool> PrintVPlansInDotFormat(
@@ -143,7 +153,7 @@ template <typename T> static T *getPlanEntry(T *Start) {
 
   for (unsigned i = 0; i < WorkList.size(); i++) {
     T *Current = WorkList[i];
-    if (Current->getNumPredecessors() == 0)
+    if (!Current->hasPredecessors())
       return Current;
     auto &Predecessors = Current->getPredecessors();
     WorkList.insert_range(Predecessors);
@@ -216,7 +226,7 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
   // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
   // VPBB as its entry, i.e., free of predecessors.
   if (auto *R = VPBB->getParent())
-    return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
+    return !R->isReplicator() && !VPBB->hasPredecessors();
 
   // A header dominates its second predecessor (the latch), with the other
   // predecessor being the preheader
@@ -493,6 +503,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
 void VPIRBasicBlock::execute(VPTransformState *State) {
   assert(getHierarchicalSuccessors().size() <= 2 &&
          "VPIRBasicBlock can have at most two successors at the moment!");
+  // Move completely disconnected blocks to their final position.
+  if (IRBB->hasNPredecessors(0) && succ_begin(IRBB) == succ_end(IRBB))
+    IRBB->moveAfter(State->CFG.PrevBB);
   State->Builder.SetInsertPoint(IRBB->getTerminator());
   State->CFG.PrevBB = IRBB;
   State->CFG.VPBB2IRBB[this] = IRBB;
@@ -809,7 +822,7 @@ InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
 
 const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
   const VPBlockBase *Pred = nullptr;
-  if (getNumPredecessors() > 0) {
+  if (hasPredecessors()) {
     Pred = getPredecessors()[Idx];
   } else {
     auto *Region = getParent();
@@ -1183,14 +1196,14 @@ VPlan *VPlan::duplicate() {
 
   BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock();
   VPIRBasicBlock *NewScalarHeader = nullptr;
-  if (getScalarHeader()->getNumPredecessors() == 0) {
-    NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
-  } else {
+  if (getScalarHeader()->hasPredecessors()) {
     NewScalarHeader = cast<VPIRBasicBlock>(*find_if(
         vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) {
           auto *VPIRBB = dyn_cast<VPIRBasicBlock>(VPB);
           return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB;
         }));
+  } else {
+    NewScalarHeader = createVPIRBasicBlock(ScalarHeaderIRBB);
   }
   // Create VPlan, clone live-ins and remap operands in the cloned blocks.
   auto *NewPlan = new VPlan(cast<VPBasicBlock>(NewEntry), NewScalarHeader);
@@ -1473,7 +1486,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
   std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
 
   // First assign the base name for V.
-  const auto &[A, _] = VPValue2Name.insert({V, BaseName});
+  const auto &[A, _] = VPValue2Name.try_emplace(V, BaseName);
   // Integer or FP constants with different types will result in he same string
   // due to stripping types.
   if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV))
@@ -1481,7 +1494,7 @@ void VPSlotTracker::assignName(const VPValue *V) {
 
   // If it is already used by C > 0 other VPValues, increase the version counter
   // C and use it for V.
-  const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0});
+  const auto &[C, UseInserted] = BaseName2Version.try_emplace(BaseName, 0);
   if (!UseInserted) {
     C->second++;
     A->second = (BaseName + Twine(".") + Twine(C->second)).str();
@@ -1612,6 +1625,123 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
   llvm_unreachable("No plan found!");
 }
 
+static void addRuntimeUnrollDisableMetaData(Loop *L) {
+  SmallVector<Metadata *, 4> MDs;
+  // Slot 0 is reserved for the self reference of the new LoopID node.
+  MDs.push_back(nullptr);
+  bool IsUnrollMetadata = false;
+  MDNode *LoopID = L->getLoopID();
+  if (LoopID) {
+    // Look for unroll-disable metadata on any operand; a match is sticky.
+    for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
+      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
+      if (MD) {
+        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+        if (!S)
+          continue;
+        if (S->getString().starts_with("llvm.loop.unroll.runtime.disable"))
+          continue;
+        IsUnrollMetadata |=
+            S->getString().starts_with("llvm.loop.unroll.disable");
+      }
+      MDs.push_back(LoopID->getOperand(I));
+    }
+  }
+
+  if (!IsUnrollMetadata) {
+    // Append llvm.loop.unroll.runtime.disable and install the new loop ID.
+    LLVMContext &Context = L->getHeader()->getContext();
+    SmallVector<Metadata *, 1> DisableOperands;
+    DisableOperands.push_back(
+        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+    MDs.push_back(DisableNode);
+    MDNode *NewLoopID = MDNode::get(Context, MDs);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+    L->setLoopID(NewLoopID);
+  }
+}
+
+void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo(
+    Loop *VectorLoop, VPBasicBlock *HeaderVPBB, bool VectorizingEpilogue,
+    unsigned EstimatedVFxUF, bool DisableRuntimeUnroll) {
+  MDNode *LID = OrigLoop->getLoopID();
+  // Update the metadata of the scalar loop. Skip the update when vectorizing
+  // the epilogue loop, to ensure it is only updated once.
+  if (!VectorizingEpilogue) {
+    std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID(
+        LID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue});
+    if (RemainderLoopID) {
+      OrigLoop->setLoopID(*RemainderLoopID);
+    } else {
+      if (DisableRuntimeUnroll)
+        addRuntimeUnrollDisableMetaData(OrigLoop);
+
+      LoopVectorizeHints Hints(OrigLoop, true, *ORE);
+      Hints.setAlreadyVectorized();
+    }
+  }
+
+  if (!VectorLoop)
+    return;
+
+  if (std::optional<MDNode *> VectorizedLoopID =
+          makeFollowupLoopID(LID, {LLVMLoopVectorizeFollowupAll,
+                                   LLVMLoopVectorizeFollowupVectorized})) {
+    VectorLoop->setLoopID(*VectorizedLoopID);
+  } else {
+    // Keep all loop hints from the original loop on the vector loop (we'll
+    // replace the vectorizer-specific hints below).
+    if (LID)
+      VectorLoop->setLoopID(LID);
+
+    if (!VectorizingEpilogue) {
+      LoopVectorizeHints Hints(VectorLoop, true, *ORE);
+      Hints.setAlreadyVectorized();
+    }
+
+    // Check if it's EVL-vectorized and mark the corresponding metadata.
+    bool IsEVLVectorized =
+        llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) {
+          // Looking for the ExplicitVectorLength VPInstruction.
+          if (const auto *VI = dyn_cast<VPInstruction>(&Recipe))
+            return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
+          return false;
+        });
+    if (IsEVLVectorized) {
+      LLVMContext &Context = VectorLoop->getHeader()->getContext();
+      MDNode *LoopID = VectorLoop->getLoopID();
+      auto *IsEVLVectorizedMD = MDNode::get(
+          Context,
+          {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"),
+           MDString::get(Context, "evl")});
+      MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {},
+                                                         {IsEVLVectorizedMD});
+      VectorLoop->setLoopID(NewLoopID);
+    }
+  }
+  TargetTransformInfo::UnrollingPreferences UP;
+  TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE);
+  if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
+    addRuntimeUnrollDisableMetaData(VectorLoop);
+
+  // Set/update profile weights for the vector and remainder loops as original
+  // loop iterations are now distributed among them. Note that original loop
+  // becomes the scalar remainder loop after vectorization.
+  //
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // end up getting a slightly rougher result but that should be OK since
+  // profile is not inherently precise anyway. Note also possible bypass of
+  // vector code caused by legality checks is ignored, assigning all the weight
+  // to the vector loop, optimistically.
+  //
+  // For scalable vectorization we can't know at compile time how many
+  // iterations of the loop are handled in one vector iteration, so instead
+  // use the value of vscale used for tuning.
+  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
   if (VPlans.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d6bc462a0dfa..53291a931530 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -219,6 +219,9 @@ public:
   size_t getNumSuccessors() const { return Successors.size(); }
   size_t getNumPredecessors() const { return Predecessors.size(); }
 
+  /// Returns true when at least one predecessor is attached to this block.
+  bool hasPredecessors() const { return getNumPredecessors() != 0; }
+
   /// An Enclosing Block of a block B is any block containing B, including B
   /// itself. \return the closest enclosing block starting from "this", which
   /// has successors. \return the root enclosing block if all enclosing blocks
@@ -400,7 +403,7 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
 
 public:
   VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
-               DebugLoc DL = {})
+               DebugLoc DL = DebugLoc::getUnknown())
       : VPDef(SC), VPUser(Operands), DL(DL) {}
 
   virtual ~VPRecipeBase() = default;
@@ -518,11 +521,11 @@ protected:
 class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
 public:
   VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                    DebugLoc DL = {})
+                    DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeBase(SC, Operands, DL), VPValue(this) {}
 
   VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                    Value *UV, DebugLoc DL = {})
+                    Value *UV, DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -557,6 +560,7 @@ public:
     case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
+    case VPRecipeBase::VPInterleaveEVLSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
@@ -712,12 +716,15 @@ public:
   VPIRFlags(GEPNoWrapFlags GEPFlags)
       : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {}
 
-public:
   void transferFlags(VPIRFlags &Other) {
     OpType = Other.OpType;
     AllFlags = Other.AllFlags;
   }
 
+  /// Only keep flags also present in \p Other. \p Other must have the same
+  /// OpType as the current object.
+  void intersectFlags(const VPIRFlags &Other);
+
   /// Drop all poison-generating flags.
   void dropPoisonGeneratingFlags() {
     // NOTE: This needs to be kept in-sync with
@@ -864,7 +871,7 @@ public:
 /// using IR flags.
 struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
   VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                      DebugLoc DL = {})
+                      DebugLoc DL = DebugLoc::getUnknown())
       : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags() {}
 
   VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
@@ -872,7 +879,8 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
       : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), VPIRFlags(I) {}
 
   VPRecipeWithIRFlags(const unsigned char SC, ArrayRef<VPValue *> Operands,
-                      const VPIRFlags &Flags, DebugLoc DL = {})
+                      const VPIRFlags &Flags,
+                      DebugLoc DL = DebugLoc::getUnknown())
       : VPSingleDefRecipe(SC, Operands, DL), VPIRFlags(Flags) {}
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -900,6 +908,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
     return R && classof(R);
   }
 
+  static inline bool classof(const VPSingleDefRecipe *U) {
+    const auto *Recipe = dyn_cast<VPRecipeBase>(U);
+    return Recipe != nullptr && classof(Recipe);
+  }
+
   void execute(VPTransformState &State) override = 0;
 
   /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
@@ -975,6 +988,10 @@ public:
     Not,
     SLPLoad,
     SLPStore,
+    // Creates a mask where each lane is active (true) whilst the current
+    // counter (first operand + index) is less than the second operand. i.e.
+    //    mask[i] = icmp ult (op0 + i), op1
+    // The size of the mask returned is VF * Multiplier (UF, third op).
     ActiveLaneMask,
     ExplicitVectorLength,
     CalculateTripCountMinusVF,
@@ -1014,7 +1031,8 @@ public:
     // Returns a scalar boolean value, which is true if any lane of its
     // (boolean) vector operands is true. It produces the reduced value across
     // all unrolled iterations. Unrolling will add all copies of its original
-    // operand as additional operands.
+    // operand as additional operands. AnyOf is poison-safe as all operands
+    // will be frozen.
     AnyOf,
     // Calculates the first active lane index of the vector predicate operands.
     // It produces the lane index across all unrolled iterations. Unrolling will
@@ -1080,13 +1098,13 @@ private:
 #endif
 
 public:
-  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
-                const Twine &Name = "")
+  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
         VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                const VPIRFlags &Flags, DebugLoc DL = {},
+                const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(),
                 const Twine &Name = "");
 
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
@@ -1479,7 +1497,8 @@ public:
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
-                    const VPIRFlags &Flags = {}, DebugLoc DL = {})
+                    const VPIRFlags &Flags = {},
+                    DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL),
         VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) {
     assert(flagsValidForOpcode(Opcode) &&
@@ -1537,7 +1556,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
 public:
   VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
-                         DebugLoc DL = {})
+                         DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
         VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
         MayReadFromMemory(CI.mayReadFromMemory()),
@@ -1546,7 +1565,7 @@ public:
 
   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
-                         DebugLoc DL = {})
+                         DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
         VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
     LLVMContext &Ctx = Ty->getContext();
@@ -1615,7 +1634,8 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
 
 public:
   VPWidenCallRecipe(Value *UV, Function *Variant,
-                    ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
+                    ArrayRef<VPValue *> CallArguments,
+                    DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
                             *cast<Instruction>(UV)),
         VPIRMetadata(*cast<Instruction>(UV)), Variant(Variant) {
@@ -1644,10 +1664,8 @@ public:
     return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
   }
 
-  operand_range args() { return make_range(op_begin(), std::prev(op_end())); }
-  const_operand_range args() const {
-    return make_range(op_begin(), std::prev(op_end()));
-  }
+  operand_range args() { return drop_end(operands()); }
+  const_operand_range args() const { return drop_end(operands()); }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -1667,7 +1685,7 @@ class VPHistogramRecipe : public VPRecipeBase {
 
 public:
   VPHistogramRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                    DebugLoc DL = {})
+                    DebugLoc DL = DebugLoc::getUnknown())
       : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}
 
   ~VPHistogramRecipe() override = default;
@@ -1998,6 +2016,9 @@ public:
     return getOperand(1);
   }
 
+  /// Set the incoming value from the loop backedge (operand 1) to \p V.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
@@ -2229,8 +2250,8 @@ protected:
 public:
   /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and
   /// debug location \p DL.
-  VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {},
-                   const Twine &Name = "")
+  VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr,
+                   DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "")
       : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL),
         Name(Name.str()) {
     if (Start)
@@ -2381,9 +2402,8 @@ public:
   }
 
   VPBlendRecipe *clone() override {
-    SmallVector<VPValue *> Ops(operands());
-    return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops,
-                             getDebugLoc());
+    return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()),
+                             operands(), getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
@@ -2409,6 +2429,12 @@ public:
     return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
   }
 
+  /// Overwrite the mask operand with index \p Idx with \p V.
+  void setMask(unsigned Idx, VPValue *V) {
+    assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
+    setOperand(Idx == 0 ? 1 : Idx * 2 + !isNormalized(), V);
+  }
+
   void execute(VPTransformState &State) override {
     llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
   }
@@ -2434,12 +2460,13 @@ public:
   }
 };
 
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
-                                             public VPIRMetadata {
+/// A common base class for interleaved memory operations.
+/// An Interleaved memory operation is a memory access method that combines
+/// multiple strided loads/stores into a single wide load/store with shuffles.
+/// The first operand is the start address. The optional operands are, in order,
+/// the stored values and the mask.
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase,
+                                           public VPIRMetadata {
   const InterleaveGroup<Instruction> *IG;
 
   /// Indicates if the interleave group is in a conditional block and requires a
@@ -2450,12 +2477,14 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase,
   /// unusued gaps can be loaded speculatively.
   bool NeedsMaskForGaps = false;
 
-public:
-  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
-                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
-                     bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
-      : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, DL), VPIRMetadata(MD),
-        IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+protected:
+  VPInterleaveBase(const unsigned char SC,
+                   const InterleaveGroup<Instruction> *IG,
+                   ArrayRef<VPValue *> Operands,
+                   ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                   bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+      : VPRecipeBase(SC, Operands, DL), VPIRMetadata(MD), IG(IG),
+        NeedsMaskForGaps(NeedsMaskForGaps) {
     // TODO: extend the masked interleaved-group support to reversed access.
     assert((!Mask || !IG->isReverse()) &&
            "Reversed masked interleave-group not supported.");
@@ -2473,14 +2502,19 @@ public:
       addOperand(Mask);
     }
   }
-  ~VPInterleaveRecipe() override = default;
 
-  VPInterleaveRecipe *clone() override {
-    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
-                                  NeedsMaskForGaps, *this, getDebugLoc());
+public:
+  VPInterleaveBase *clone() override = 0;
+
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+           R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
   }
 
-  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
 
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const {
@@ -2490,48 +2524,130 @@ public:
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
   VPValue *getMask() const {
-    // Mask is optional and therefore the last, currently 2nd operand.
+    // Mask is optional and the last operand.
     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
   }
 
+  /// Return true if the access needs a mask because of the gaps.
+  bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
+  const InterleaveGroup<Instruction> *getInterleaveGroup() const { return IG; }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("VPInterleaveBase should not be instantiated.");
+  }
+
+  /// Return the cost of this recipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  virtual bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+
+  /// Returns the number of stored operands of this interleave group. Returns 0
+  /// for load interleave groups.
+  virtual unsigned getNumStoreOperands() const = 0;
+
   /// Return the VPValues stored by this interleave group. If it is a load
   /// interleave group, return an empty ArrayRef.
   ArrayRef<VPValue *> getStoredValues() const {
-    // The first operand is the address, followed by the stored values, followed
-    // by an optional mask.
-    return ArrayRef<VPValue *>(op_begin(), getNumOperands())
-        .slice(1, getNumStoreOperands());
+    return ArrayRef<VPValue *>(op_end() -
+                                   (getNumStoreOperands() + (HasMask ? 1 : 0)),
+                               getNumStoreOperands());
+  }
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of loads
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleaveRecipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                     bool NeedsMaskForGaps, const VPIRMetadata &MD, DebugLoc DL)
+      : VPInterleaveBase(VPDef::VPInterleaveSC, IG, Addr, StoredValues, Mask,
+                         NeedsMaskForGaps, MD, DL) {}
+
+  ~VPInterleaveRecipe() override = default;
+
+  VPInterleaveRecipe *clone() override {
+    return new VPInterleaveRecipe(getInterleaveGroup(), getAddr(),
+                                  getStoredValues(), getMask(),
+                                  needsMaskForGaps(), *this, getDebugLoc());
   }
 
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
-  /// Return the cost of this VPInterleaveRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
 
-  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+  }
 
-  /// Returns the number of stored operands of this interleave group. Returns 0
-  /// for load interleave groups.
-  unsigned getNumStoreOperands() const {
-    return getNumOperands() - (HasMask ? 2 : 1);
+  unsigned getNumStoreOperands() const override {
+    return getNumOperands() - (getMask() ? 2 : 1);
   }
+};
+
+/// A recipe for interleaved memory operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length. Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
+      : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+                         ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+                         R.getStoredValues(), Mask, R.needsMaskForGaps(), R,
+                         R.getDebugLoc()) {
+    assert(!getInterleaveGroup()->isReverse() &&
+           "Reversed interleave-group with tail folding is not supported.");
+    assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
+                                  "supported for scalable vector.");
+  }
+
+  ~VPInterleaveEVLRecipe() override = default;
+
+  VPInterleaveEVLRecipe *clone() override {
+    llvm_unreachable("cloning not implemented yet");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+  /// The VPValue of the explicit vector length.
+  VPValue *getEVL() const { return getOperand(1); }
 
-  /// The recipe only uses the first lane of the address.
+  /// Generate the wide load or store, and shuffles.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// The recipe only uses the first lane of the address and the EVL operand.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+    return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+           Op == getEVL();
   }
 
-  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+  unsigned getNumStoreOperands() const override {
+    return getNumOperands() - (getMask() ? 3 : 2);
+  }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2561,14 +2677,14 @@ protected:
 public:
   VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
                           IsOrdered, DL) {}
 
   VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
                           IsOrdered, DL) {}
@@ -2686,7 +2802,7 @@ public:
 class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
 public:
   VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp,
-                       DebugLoc DL = {})
+                       DebugLoc DL = DebugLoc::getUnknown())
       : VPReductionRecipe(
             VPDef::VPReductionEVLSC, R.getRecurrenceKind(),
             R.getFastMathFlags(),
@@ -3537,7 +3653,8 @@ public:
         InductionOpcode(Opcode) {}
 
   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
-                        VPValue *Step, VPValue *VF, DebugLoc DL = {})
+                        VPValue *Step, VPValue *VF,
+                        DebugLoc DL = DebugLoc::getUnknown())
       : VPScalarIVStepsRecipe(
             IV, Step, VF, IndDesc.getInductionOpcode(),
             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
@@ -4142,7 +4259,7 @@ public:
   /// Returns an iterator range over all VFs of the plan.
   iterator_range<SmallSetVector<ElementCount, 2>::iterator>
   vectorFactors() const {
-    return {VFs.begin(), VFs.end()};
+    return VFs;
   }
 
   bool hasScalarVFOnly() const {
@@ -4299,9 +4416,8 @@ public:
   /// via the other early exit).
   bool hasEarlyExit() const {
     return count_if(ExitBlocks,
-                    [](VPIRBasicBlock *EB) {
-                      return EB->getNumPredecessors() != 0;
-                    }) > 1 ||
+                    [](VPIRBasicBlock *EB) { return EB->hasPredecessors(); }) >
+               1 ||
            (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
   }
 
@@ -4309,7 +4425,7 @@ public:
   /// that this relies on unneeded branches to the scalar tail loop being
   /// removed.
   bool hasScalarTail() const {
-    return !(getScalarPreheader()->getNumPredecessors() == 0 ||
+    return !(!getScalarPreheader()->hasPredecessors() ||
              getScalarPreheader()->getSinglePredecessor() == getEntry());
   }
 };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22..d400ceff7797 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                 VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
-          .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+          .Case<VPInterleaveBase>([V](const auto *R) {
             // TODO: Use info from interleave group.
             return V->getUnderlyingValue()->getType();
           })
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 80b48de57b40..cef91c15dd87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -193,6 +193,9 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
     }
 
     if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+      // Don't emit recipes for unconditional switch instructions.
+      if (SI->getNumCases() == 0)
+        continue;
       SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
       for (auto Case : SI->cases())
         Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
@@ -538,8 +541,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
 }
 
 void VPlanTransforms::handleEarlyExits(VPlan &Plan,
-                                       bool HasUncountableEarlyExit,
-                                       VFRange &Range) {
+                                       bool HasUncountableEarlyExit) {
   auto *MiddleVPBB = cast<VPBasicBlock>(
       Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
   auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
@@ -559,8 +561,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
         assert(!HandledUncountableEarlyExit &&
                "can handle exactly one uncountable early exit");
         handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
-                                   cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
-                                   Range);
+                                   cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
         HandledUncountableEarlyExit = true;
       } else {
         for (VPRecipeBase &R : EB->phis())
@@ -671,6 +672,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
   }
 }
 
+void VPlanTransforms::addMinimumIterationCheck(
+    VPlan &Plan, ElementCount VF, unsigned UF,
+    ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+    bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+    const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+  // Generate code to check if the loop's trip count is less than VF * UF, or
+  // equal to it in case a scalar epilogue is required; this implies that the
+  // vector trip count is zero. This check also covers the case where adding one
+  // to the backedge-taken count overflowed leading to an incorrect trip count
+  // of zero. In this case we will also jump to the scalar loop.
+  CmpInst::Predicate CmpPred =
+      RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+  // If tail is to be folded, vector loop takes care of all iterations.
+  VPValue *TripCountVPV = Plan.getTripCount();
+  const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE);
+  Type *TripCountTy = TripCount->getType();
+  auto GetMinTripCount = [&]() -> const SCEV * {
+    // Compute max(MinProfitableTripCount, UF * VF) and return it.
+    const SCEV *VFxUF =
+        SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW);
+    if (UF * VF.getKnownMinValue() >=
+        MinProfitableTripCount.getKnownMinValue()) {
+      // TODO: SCEV should be able to simplify test.
+      return VFxUF;
+    }
+    const SCEV *MinProfitableTripCountSCEV =
+        SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW);
+    return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF);
+  };
+
+  VPBasicBlock *EntryVPBB = Plan.getEntry();
+  VPBuilder Builder(EntryVPBB);
+  VPValue *TripCountCheck = Plan.getFalse();
+  const SCEV *Step = GetMinTripCount();
+  if (TailFolded) {
+    if (CheckNeededWithTailFolding) {
+      // vscale is not necessarily a power-of-2, which means we cannot guarantee
+      // an overflow to zero when updating induction variables and so an
+      // additional overflow check is required before entering the vector loop.
+
+      // Get the maximum unsigned value for the type.
+      VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
+          TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+      VPValue *DistanceToMax = Builder.createNaryOp(
+          Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
+          DebugLoc::getUnknown());
+
+      // Don't execute the vector loop if (UMax - n) < (VF * UF).
+      // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF,
+      // minProfitableTripCount).
+      TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
+                                          Builder.createExpandSCEV(Step), DL);
+    } else {
+      // TripCountCheck = false, folding tail implies positive vector trip
+      // count.
+    }
+  } else {
+    // TODO: Emit unconditional branch to vector preheader instead of
+    // conditional branch with known condition.
+    TripCount = SE.applyLoopGuards(TripCount, OrigLoop);
+    // Check if the trip count is < the step.
+    if (SE.isKnownPredicate(CmpPred, TripCount, Step)) {
+      // TODO: Ensure step is at most the trip count when determining max VF and
+      // UF, w/o tail folding.
+      TripCountCheck = Plan.getTrue();
+    } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred),
+                                    TripCount, Step)) {
+      // Generate the minimum iteration check only if we cannot prove the
+      // check is known to be true, or known to be false.
+      VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step);
+      TripCountCheck = Builder.createICmp(
+          CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check");
+    } // else CmpPred is known false, use TripCountCheck preset to false.
+  }
+  VPInstruction *Term =
+      Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL);
+  if (MinItersBypassWeights) {
+    MDBuilder MDB(Plan.getContext());
+    MDNode *BranchWeights = MDB.createBranchWeights(
+        ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
+    Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
+  }
+}
+
 bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
     auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 1ec6ae677374..109156c1469c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -145,6 +145,16 @@ inline int_pred_ty<is_all_ones> m_AllOnes() {
   return int_pred_ty<is_all_ones>();
 }
 
+struct is_zero_int {
+  bool isValue(const APInt &C) const { return C.isZero(); }
+};
+
+/// Match an integer 0 or a vector with all elements equal to 0.
+/// For vectors, this includes constants with undefined elements.
+inline int_pred_ty<is_zero_int> m_ZeroInt() {
+  return int_pred_ty<is_zero_int>();
+}
+
 /// Matching combinators
 template <typename LTy, typename RTy> struct match_combine_or {
   LTy L;
@@ -218,9 +228,12 @@ struct Recipe_match {
     if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...))
       return false;
 
-    assert(R->getNumOperands() == std::tuple_size<Ops_t>::value &&
-           "recipe with matched opcode does not have the expected number of "
-           "operands");
+    if (R->getNumOperands() != std::tuple_size<Ops_t>::value) {
+      assert(Opcode == Instruction::PHI &&
+             "non-variadic recipe with matched opcode does not have the "
+             "expected number of operands");
+      return false;
+    }
 
     auto IdxSeq = std::make_index_sequence<std::tuple_size<Ops_t>::value>();
     if (all_of_tuple_elements(IdxSeq, [R](auto Op, unsigned Idx) {
@@ -302,14 +315,21 @@ m_Broadcast(const Op0_t &Op0) {
 }
 
 template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::ExplicitVectorLength, Op0_t>
+m_EVL(const Op0_t &Op0) {
+  return m_VPInstruction<VPInstruction::ExplicitVectorLength>(Op0);
+}
+
+template <typename Op0_t>
 inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
 m_ExtractLastElement(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
 }
-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template <typename Op0_t, typename Op1_t>
@@ -345,6 +365,12 @@ m_ZExtOrSExt(const Op0_t &Op0) {
   return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
 }
 
+template <typename Op0_t>
+inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>, Op0_t>
+m_ZExtOrSelf(const Op0_t &Op0) {
+  return m_CombineOr(m_ZExt(Op0), Op0);
+}
+
 template <unsigned Opcode, typename Op0_t, typename Op1_t>
 inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0,
                                                       const Op1_t &Op1) {
@@ -381,6 +407,13 @@ m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
   return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
 }
 
+/// Match a binary AND operation, matching the operands commutatively.
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_commutative_match<Instruction::And, Op0_t, Op1_t>
+m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_c_Binary<Instruction::And, Op0_t, Op1_t>(Op0, Op1);
+}
+
 /// Match a binary OR operation. Note that while conceptually the operands can
 /// be matched commutatively, \p Commutative defaults to false in line with the
 /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index cdadc33e3088..0c27d535b680 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,11 +14,13 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
 
 using namespace llvm;
+using namespace VPlanPatternMatch;
 
 namespace {
 class VPPredicator {
@@ -246,6 +248,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
                "Distinct incoming values with one having a full mask");
         break;
       }
+
       OperandsWithMask.push_back(EdgeMask);
     }
     PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c4fdcccc6d62..bf5148954309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
   case VPInstructionSC:
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+  case VPInterleaveEVLSC:
   case VPInterleaveSC:
-    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
   case VPWidenStoreSC:
     return true;
@@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return false;
   }
   default:
+    // FIXME: Return false if the recipe represents an interleaved store.
     return true;
   }
 }
@@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
            "underlying instruction has side-effects");
     return false;
   }
+  case VPInterleaveEVLSC:
   case VPInterleaveSC:
     return mayWriteToMemory();
   case VPWidenLoadEVLSC:
@@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
   Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+  else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
     UI = IG->getInsertPos();
   else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
     UI = &WidenMem->getIngredient();
@@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
+  assert(OpType == Other.OpType && "OpType must match");
+  switch (OpType) {
+  case OperationType::OverflowingBinOp:
+    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
+    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
+    break;
+  case OperationType::Trunc:
+    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
+    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
+    break;
+  case OperationType::DisjointOp:
+    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
+    break;
+  case OperationType::PossiblyExactOp:
+    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
+    break;
+  case OperationType::GEPOp:
+    GEPFlags &= Other.GEPFlags;
+    break;
+  case OperationType::FPMathOp:
+    FMFs.NoNaNs &= Other.FMFs.NoNaNs;
+    FMFs.NoInfs &= Other.FMFs.NoInfs;
+    break;
+  case OperationType::NonNegOp:
+    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
+    break;
+  case OperationType::Cmp:
+    assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
+    break;
+  case OperationType::Other:
+    assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
+    break;
+  }
+}
+
 FastMathFlags VPIRFlags::getFastMathFlags() const {
   assert(OpType == OperationType::FPMathOp &&
          "recipe doesn't have fast math flags");
@@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
   case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                               ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
   }
   case VPInstruction::AnyOf: {
-    Value *Res = State.get(getOperand(0));
+    Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
     for (VPValue *Op : drop_begin(operands()))
-      Res = Builder.CreateOr(Res, State.get(Op));
+      Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
     return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
   }
   case VPInstruction::ExtractLane: {
@@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
     unsigned LastOpIdx = getNumOperands() - 1;
     Value *Res = nullptr;
     for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
-      Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
-          Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+      Value *TrailingZeros =
+          State.VF.isScalar()
+              ? Builder.CreateZExt(
+                    Builder.CreateICmpEQ(State.get(getOperand(Idx)),
+                                         Builder.getFalse()),
+                    Builder.getInt64Ty())
+              : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
+                                                     State.get(getOperand(Idx)),
+                                                     true, Name);
       Value *Current = Builder.CreateAdd(
           Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
       if (Res) {
@@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }
 
   switch (getOpcode()) {
+  case Instruction::Select: {
+    // TODO: It may be possible to improve this by analyzing where the
+    // condition operand comes from.
+    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+    auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+    if (!vputils::onlyFirstLaneUsed(this)) {
+      CondTy = toVectorTy(CondTy, VF);
+      VecTy = toVectorTy(VecTy, VF);
+    }
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+                                      Ctx.CostKind);
+  }
   case Instruction::ExtractElement:
   case VPInstruction::ExtractLane: {
+    if (VF.isScalar()) {
+      // ExtractLane with VF=1 takes care of handling extracting across multiple
+      // parts.
+      return 0;
+    }
+
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
     return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
         Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
   }
   case VPInstruction::FirstActiveLane: {
+    Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+    if (VF.isScalar())
+      return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                        CmpInst::makeCmpResultType(ScalarTy),
+                                        CmpInst::ICMP_EQ, Ctx.CostKind);
     // Calculate the cost of determining the lane index.
-    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    auto *PredTy = toVectorTy(ScalarTy, VF);
     IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
                                   Type::getInt64Ty(Ctx.LLVMCtx),
                                   {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
@@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }
   case VPInstruction::ActiveLaneMask: {
     Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
-    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+    unsigned Multiplier =
+        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
     IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                   {ArgTy, ArgTy});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
@@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
     State.set(this, V);
 }
 
-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
-                                                    VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+                                            ArrayRef<const VPValue *> Operands,
+                                            const VPRecipeWithIRFlags &R,
+                                            ElementCount VF,
+                                            VPCostContext &Ctx) {
   // Some backends analyze intrinsic arguments to determine cost. Use the
   // underlying value for the operand if it has one. Otherwise try to use the
   // operand of the underlying call instruction, if there is one. Otherwise
   // clear Arguments.
   // TODO: Rework TTI interface to be independent of concrete IR values.
   SmallVector<const Value *> Arguments;
-  for (const auto &[Idx, Op] : enumerate(operands())) {
+  for (const auto &[Idx, Op] : enumerate(Operands)) {
     auto *V = Op->getUnderlyingValue();
     if (!V) {
-      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+      if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
         Arguments.push_back(UI->getArgOperand(Idx));
         continue;
       }
@@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
     Arguments.push_back(V);
   }
 
-  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+  Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+  Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
   SmallVector<Type *> ParamTys;
-  for (unsigned I = 0; I != getNumOperands(); ++I)
-    ParamTys.push_back(
-        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+  for (const VPValue *Op : Operands) {
+    ParamTys.push_back(VF.isVector()
+                           ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+                           : Ctx.Types.inferScalarType(Op));
+  }
 
   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
-  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+  FastMathFlags FMF =
+      R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(
-      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
-      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+      ID, RetTy, Arguments, ParamTys, FMF,
+      dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
       InstructionCost::getInvalid(), &Ctx.TLI);
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }
 
+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+                                                    VPCostContext &Ctx) const {
+  SmallVector<const VPValue *> ArgOps(operands());
+  return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
+}
+
 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }
@@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    // More complex computation, let the legacy cost-model handle this for now.
-    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+    // If the div/rem operation isn't safe to speculate and requires
+    // predication, then the only way we can even create a vplan is to insert
+    // a select on the second input operand to ensure we use the value of 1
+    // for the inactive lanes. The select will be costed separately.
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
@@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
     if (VF.isScalar())
       return TTI::CastContextHint::Normal;
-    if (isa<VPInterleaveRecipe>(R))
+    if (isa<VPInterleaveBase>(R))
       return TTI::CastContextHint::Interleave;
     if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
       return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
       toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
   assert(RedTy->isIntegerTy() &&
          "VPExpressionRecipe only supports integer types currently.");
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(
+      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
   switch (ExpressionType) {
   case ExpressionTypes::ExtendedReduction: {
-    unsigned Opcode = RecurrenceDescriptor::getOpcode(
-        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
     return Ctx.TTI.getExtendedReductionCost(
         Opcode,
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
   }
   case ExpressionTypes::MulAccReduction:
-    return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+    return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+                                          Ctx.CostKind);
 
   case ExpressionTypes::ExtMulAccReduction:
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
-        RedTy, SrcVecTy, Ctx.CostKind);
+        Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
@@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     // instruction cost.
     return 0;
   case Instruction::Call: {
-    if (!isSingleScalar()) {
-      // TODO: Handle remaining call costs here as well.
-      if (VF.isScalable())
-        return InstructionCost::getInvalid();
-      break;
-    }
-
     auto *CalledFn =
         cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
-    if (CalledFn->isIntrinsic())
-      break;
 
+    SmallVector<const VPValue *> ArgOps(drop_end(operands()));
     SmallVector<Type *, 4> Tys;
-    for (VPValue *ArgOp : drop_end(operands()))
+    for (const VPValue *ArgOp : ArgOps)
       Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+    if (CalledFn->isIntrinsic())
+      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+      switch (CalledFn->getIntrinsicID()) {
+      case Intrinsic::assume:
+      case Intrinsic::lifetime_end:
+      case Intrinsic::lifetime_start:
+      case Intrinsic::sideeffect:
+      case Intrinsic::pseudoprobe:
+      case Intrinsic::experimental_noalias_scope_decl: {
+        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                    ElementCount::getFixed(1), Ctx) == 0 &&
+               "scalarizing intrinsic should be free");
+        return InstructionCost(0);
+      }
+      default:
+        break;
+      }
+
     Type *ResultTy = Ctx.Types.inferScalarType(this);
-    return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+    InstructionCost ScalarCallCost =
+        Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+    if (isSingleScalar()) {
+      if (CalledFn->isIntrinsic())
+        ScalarCallCost = std::min(
+            ScalarCallCost,
+            getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                 ElementCount::getFixed(1), Ctx));
+      return ScalarCallCost;
+    }
+
+    if (VF.isScalable())
+      return InstructionCost::getInvalid();
+
+    // Compute the cost of scalarizing the result and operands if needed.
+    InstructionCost ScalarizationCost = 0;
+    if (VF.isVector()) {
+      if (!ResultTy->isVoidTy()) {
+        for (Type *VectorTy :
+             to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+          ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+              cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+              /*Insert=*/true,
+              /*Extract=*/false, Ctx.CostKind);
+        }
+      }
+      // Skip operands that do not require extraction/scalarization and do not
+      // incur any overhead.
+      SmallPtrSet<const VPValue *, 4> UniqueOperands;
+      Tys.clear();
+      for (auto *Op : ArgOps) {
+        if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+            !UniqueOperands.insert(Op).second)
+          continue;
+        Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+      }
+      ScalarizationCost +=
+          Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+    }
+
+    return ScalarCallCost * VF.getFixedValue() + ScalarizationCost;
   }
   case Instruction::Add:
   case Instruction::Sub:
@@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
     return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
                                        Ctx) *
            (isSingleScalar() ? 1 : VF.getFixedValue());
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (isSingleScalar()) {
+      bool IsLoad = UI->getOpcode() == Instruction::Load;
+      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+      const Align Alignment = getLoadStoreAlignment(UI);
+      unsigned AS = getLoadStoreAddressSpace(UI);
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+    }
+    // TODO: See getMemInstScalarizationCost for how to handle replicating and
+    // predicated cases.
+    break;
   }
   }
 
@@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     // TODO: Using the original IR may not be accurate.
     // Currently, ARM will use the underlying IR to calculate gather/scatter
     // instruction cost.
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    Type *PtrTy = toVectorTy(Ptr->getType(), VF);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
+
+    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+    Type *PtrTy = Ptr->getType();
+
+    // If the address value is uniform across all lanes, then the address can be
+    // calculated with scalar type and broadcast.
+    if (!vputils::isSingleScalar(getAddr()))
+      PtrTy = toVectorTy(PtrTy, VF);
+
     return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
                                              Ctx.CostKind) +
            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
@@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "Interleave group being replicated.");
-  assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
          "Masking gaps for scalable vectors is not yet supported.");
-  const InterleaveGroup<Instruction> *Group = IG;
+  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
   Instruction *Instr = Group->getInsertPos();
 
   // Prepare for the vector type of the interleaved load/store.
@@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
-    if (NeedsMaskForGaps) {
+    if (needsMaskForGaps()) {
       MaskForGaps =
           createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
@@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   // Vectorize the interleaved store group.
   Value *MaskForGaps =
       createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
-  assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
          "Mismatch between NeedsMaskForGaps and MaskForGaps");
   ArrayRef<VPValue *> StoredValues = getStoredValues();
   // Collect the stored vector from each member.
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
+  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
   O << ", ";
@@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
-                                                VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
+  assert(!State.Lane && "Interleave group being replicated.");
+  assert(State.VF.isScalable() &&
+         "Only support scalable VF for EVL tail-folding.");
+  assert(!needsMaskForGaps() &&
+         "Masking gaps for scalable vectors is not yet supported.");
+  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
+  Instruction *Instr = Group->getInsertPos();
+
+  // Prepare for the vector type of the interleaved load/store.
+  Type *ScalarTy = getLoadStoreType(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  assert(InterleaveFactor <= 8 &&
+         "Unsupported deinterleave/interleave factor for scalable vectors");
+  ElementCount WideVF = State.VF * InterleaveFactor;
+  auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+  VPValue *Addr = getAddr();
+  Value *ResAddr = State.get(Addr, VPLane(0));
+  Value *EVL = State.get(getEVL(), VPLane(0));
+  Value *InterleaveEVL = State.Builder.CreateMul(
+      EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
+      /* NUW= */ true, /* NSW= */ true);
+  LLVMContext &Ctx = State.Builder.getContext();
+
+  Value *GroupMask = nullptr;
+  if (VPValue *BlockInMask = getMask()) {
+    SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
+    GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
+  } else {
+    GroupMask =
+        State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+  }
+
+  // Vectorize the interleaved load group.
+  if (isa<LoadInst>(Instr)) {
+    CallInst *NewLoad = State.Builder.CreateIntrinsic(
+        VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
+        "wide.vp.load");
+    NewLoad->addParamAttr(0,
+                          Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+    applyMetadata(*NewLoad);
+    // TODO: Also manage existing metadata using VPIRMetadata.
+    Group->addMetadata(NewLoad);
+
+    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+    // so must use intrinsics to deinterleave.
+    NewLoad = State.Builder.CreateIntrinsic(
+        Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+        NewLoad->getType(), NewLoad,
+        /*FMFSource=*/nullptr, "strided.vec");
+
+    const DataLayout &DL = Instr->getDataLayout();
+    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+      Instruction *Member = Group->getMember(I);
+      // Skip the gaps in the group.
+      if (!Member)
+        continue;
+
+      Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+      // If this member has a different type, cast the result to that type.
+      if (Member->getType() != ScalarTy) {
+        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+        StridedVec =
+            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+      }
+
+      State.set(getVPValue(J), StridedVec);
+      ++J;
+    }
+    return;
+  } // End for interleaved load.
+
+  // The sub-vector type for the current instruction.
+  auto *SubVT = VectorType::get(ScalarTy, State.VF);
+  // Vectorize the interleaved store group.
+  ArrayRef<VPValue *> StoredValues = getStoredValues();
+  // Collect the stored vector from each member.
+  SmallVector<Value *, 4> StoredVecs;
+  const DataLayout &DL = Instr->getDataLayout();
+  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
+    Instruction *Member = Group->getMember(I);
+    // Skip the gaps in the group.
+    if (!Member) {
+      StoredVecs.push_back(PoisonValue::get(SubVT));
+      continue;
+    }
+
+    Value *StoredVec = State.get(StoredValues[StoredIdx]);
+    // If this member has a different type, cast it to the unified type.
+    if (StoredVec->getType() != SubVT)
+      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+    StoredVecs.push_back(StoredVec);
+    ++StoredIdx;
+  }
+
+  // Interleave all the smaller vectors into one wider vector.
+  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+  CallInst *NewStore =
+      State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
+                                    {IVec, ResAddr, GroupMask, InterleaveEVL});
+  NewStore->addParamAttr(1,
+                         Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+  applyMetadata(*NewStore);
+  // TODO: Also manage existing metadata using VPIRMetadata.
+  Group->addMetadata(NewStore);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
+  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+  IG->getInsertPos()->printAsOperand(O, false);
+  O << ", ";
+  getAddr()->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getEVL()->printAsOperand(O, SlotTracker);
+  if (VPValue *Mask = getMask()) {
+    O << ", ";
+    Mask->printAsOperand(O, SlotTracker);
+  }
+
+  unsigned OpIdx = 0;
+  for (unsigned i = 0; i < IG->getFactor(); ++i) {
+    if (!IG->getMember(i))
+      continue;
+    if (getNumStoreOperands() > 0) {
+      O << "\n" << Indent << "  vp.store ";
+      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
+      O << " to index " << i;
+    } else {
+      O << "\n" << Indent << "  ";
+      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+      O << " = vp.load from index " << i;
+    }
+    ++OpIdx;
+  }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+                                              VPCostContext &Ctx) const {
   Instruction *InsertPos = getInsertPos();
   // Find the VPValue index of the interleave group. We need to skip gaps.
   unsigned InsertPosIdx = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e0bf241c73fd..2cac5557daee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
@@ -39,6 +40,10 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
+cl::opt<bool> EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
@@ -142,7 +147,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
       for (VPValue *Op : Recipe.operands())
         if (auto *Def =
                 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
-          WorkList.insert(std::make_pair(VPBB, Def));
+          WorkList.insert({VPBB, Def});
     }
   }
 
@@ -206,7 +211,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
     for (VPValue *Op : SinkCandidate->operands())
       if (auto *Def =
               dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
-        WorkList.insert(std::make_pair(SinkTo, Def));
+        WorkList.insert({SinkTo, Def});
     Changed = true;
   }
   return Changed;
@@ -344,7 +349,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   auto *BlockInMask = PredRecipe->getMask();
   auto *MaskDef = BlockInMask->getDefiningRecipe();
   auto *BOMRecipe = new VPBranchOnMaskRecipe(
-      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
+      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
   auto *Entry =
       Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
 
@@ -859,8 +864,8 @@ static VPValue *optimizeLatchExitInductionUser(
     Type *StepTy = TypeInfo.inferScalarType(Step);
     auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
     return B.createPtrAdd(EndValue,
-                          B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
-                          "ind.escape");
+                          B.createNaryOp(Instruction::Sub, {Zero, Step}),
+                          DebugLoc::getUnknown(), "ind.escape");
   }
   if (ScalarTy->isFloatingPointTy()) {
     const auto &ID = WideIV->getInductionDescriptor();
@@ -910,10 +915,10 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
     if (!ExpR)
       continue;
 
-    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
-    if (I.second)
+    const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
+    if (Inserted)
       continue;
-    ExpR->replaceAllUsesWith(I.first->second);
+    ExpR->replaceAllUsesWith(V->second);
     ExpR->eraseFromParent();
   }
 }
@@ -1067,7 +1072,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
   // && (Y || Z) and (X || !X) into true. This requires queuing newly created
   // recipes to be visited during simplification.
-  VPValue *X, *Y;
+  VPValue *X, *Y, *Z;
   if (match(Def,
             m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                          m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
@@ -1076,13 +1081,37 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  // OR x, 1 -> 1.
-  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
-    Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
-                                                    : Def->getOperand(0));
-    Def->eraseFromParent();
-    return;
-  }
+  // x | 1 -> 1
+  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
+    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+  // x | 0 -> x
+  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
+    return Def->replaceAllUsesWith(X);
+
+  // x & 0 -> 0
+  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
+    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
+
+  // x && false -> false
+  if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
+    return Def->replaceAllUsesWith(Def->getOperand(1));
+
+  // (x && y) || (x && z) -> x && (y || z)
+  VPBuilder Builder(Def);
+  if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                              m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
+      // Simplify only if one of the operands has one use to avoid creating an
+      // extra recipe.
+      (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
+       !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
+    return Def->replaceAllUsesWith(
+        Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
+
+  // x && !x -> 0
+  if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
+    return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
+        ConstantInt::getFalse(VPTypeAnalysis(*Plan).inferScalarType(Def))));
 
   if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
     return Def->replaceAllUsesWith(X);
@@ -1096,6 +1125,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
+  // tail folding it is likely that x is a header mask and can be simplified
+  // further.
+  if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                              m_VPValue(Z))) &&
+      X->hasMoreThanOneUniqueUser())
+    return Def->replaceAllUsesWith(
+        Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
+
   if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
     return Def->replaceAllUsesWith(A);
 
@@ -1150,7 +1188,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
                      m_VPValue(X), m_SpecificInt(1)))) {
     Type *WideStepTy = TypeInfo.inferScalarType(Def);
     if (TypeInfo.inferScalarType(X) != WideStepTy)
-      X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
+      X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
     Def->replaceAllUsesWith(X);
     return;
   }
@@ -1232,11 +1270,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  VPInstruction *OpVPI;
-  if (match(Def, m_ExtractLastElement(m_VPInstruction(OpVPI))) &&
-      OpVPI->isVectorToScalar()) {
-    Def->replaceAllUsesWith(OpVPI);
-    return;
+  if (match(Def,
+            m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
+      vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
+        return U->usesScalars(A) || Def == U;
+      })) {
+    return Def->replaceAllUsesWith(A);
   }
 }
 
@@ -1269,11 +1308,29 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;
 
       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        auto *Clone = new VPReplicateRecipe(
+            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+        Clone->insertBefore(RepOrWidenR);
+        auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
+                                      {Clone->getOperand(0)});
+        Ext->insertBefore(Clone);
+        Clone->setOperand(0, Ext);
+        RepR->eraseFromParent();
+        continue;
+      }
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
       if (!vputils::isSingleScalar(RepOrWidenR) ||
-          !vputils::onlyScalarValuesUsed(RepOrWidenR))
+          !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
+            return U->usesScalars(RepOrWidenR) ||
+                   match(cast<VPRecipeBase>(U),
+                         m_ExtractLastElement(m_VPValue()));
+          }))
         continue;
 
       auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
@@ -1285,6 +1342,23 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   }
 }
 
+/// If all of \p Blend's masks are a logical-and with the same first operand,
+/// strip that common operand from every mask.
+static void removeCommonBlendMask(VPBlendRecipe *Blend) {
+  if (Blend->isNormalized())
+    return;
+  VPValue *CommonEdgeMask;
+  if (!match(Blend->getMask(0),
+             m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
+    return;
+  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+    if (!match(Blend->getMask(I),
+               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
+      return;
+  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
+    Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
+}
+
 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
 /// to make sure the masks are simplified.
 static void simplifyBlends(VPlan &Plan) {
@@ -1295,6 +1369,8 @@ static void simplifyBlends(VPlan &Plan) {
       if (!Blend)
         continue;
 
+      removeCommonBlendMask(Blend);
+
       // Try to remove redundant blend recipes.
       SmallPtrSet<VPValue *, 4> UniqueValues;
       if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
@@ -1467,6 +1543,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
 }
 
+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has its last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+                                       unsigned UF) {
+  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+    return false;
+
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  using namespace llvm::VPlanPatternMatch;
+  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                       m_VPValue(), m_VPValue(), m_VPValue())))))
+    return false;
+
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  LLVMContext &Ctx = Plan.getContext();
+
+  auto ExtractFromALM = [&](VPInstruction *ALM,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+                                             IntegerType::getInt1Ty(Ctx), DL);
+      Extracts[Part] = Ext;
+      Ext->insertAfter(ALM);
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis()) {
+    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+    if (!Phi)
+      continue;
+    VPValue *Index = nullptr;
+    match(Phi->getBackedgeValue(),
+          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+    assert(Index && "Expected index from ActiveLaneMask instruction");
+
+    auto *II = dyn_cast<VPInstruction>(Index);
+    if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+      auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+      Phis[Part->getZExtValue()] = Phi;
+    } else
+      // Anything other than a CanonicalIVIncrementForPart is part 0
+      Phis[0] = Phi;
+  }
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+         "Expected incoming values of Phi to be ActiveLaneMasks");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (last operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  EntryALM->setOperand(2, ALMMultiplier);
+  LoopALM->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts(UF);
+  ExtractFromALM(EntryALM, EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts(UF);
+  ExtractFromALM(LoopALM, LoopExtracts);
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return true;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1478,8 +1650,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                      m_VPValue(), m_VPValue(), m_VPValue()))))) {
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is   BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1558,8 +1730,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
 
-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
+  MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
 
   if (MadeChange) {
@@ -1792,6 +1964,110 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
+namespace {
+struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
+  static bool isSentinel(const VPSingleDefRecipe *Def) {
+    return Def == getEmptyKey() || Def == getTombstoneKey();
+  }
+
+  /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
+  /// Returns a pair whose first element is true for an intrinsic ID and false
+  /// for an opcode, or std::nullopt if \p R is not one of the handled recipes.
+  static std::optional<std::pair<bool, unsigned>>
+  getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
+    return TypeSwitch<const VPSingleDefRecipe *,
+                      std::optional<std::pair<bool, unsigned>>>(R)
+        .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+              VPWidenSelectRecipe, VPReplicateRecipe>(
+            [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+        .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+          return std::make_pair(true, I->getVectorIntrinsicID());
+        })
+        .Default([](auto *) { return std::nullopt; });
+  }
+
+  /// Returns true if recipe \p Def can be safely handled for CSE.
+  static bool canHandle(const VPSingleDefRecipe *Def) {
+    // We can extend the list of handled recipes in the future,
+    // provided we account for the data embedded in them while checking for
+    // equality or hashing.
+    auto C = getOpcodeOrIntrinsicID(Def);
+
+    // The issue with (Insert|Extract)Value is that the index of the
+    // insert/extract is not a proper operand in LLVM IR, and hence also not in
+    // VPlan.
+    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
+                             C->second == Instruction::ExtractValue)))
+      return false;
+
+    // During CSE, we can only handle recipes that don't read from memory: if
+    // they read from memory, there could be an intervening write to memory
+    // before the next instance is CSE'd, leading to an incorrect result.
+    return !Def->mayReadFromMemory();
+  }
+
+  /// Hash the underlying data of \p Def.
+  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
+    const VPlan *Plan = Def->getParent()->getPlan();
+    VPTypeAnalysis TypeInfo(*Plan);
+    hash_code Result = hash_combine(
+        Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
+        TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
+        hash_combine_range(Def->operands()));
+    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
+      if (RFlags->hasPredicate())
+        return hash_combine(Result, RFlags->getPredicate());
+    return Result;
+  }
+
+  /// Check equality of underlying data of \p L and \p R.
+  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
+    if (isSentinel(L) || isSentinel(R))
+      return L == R;
+    if (L->getVPDefID() != R->getVPDefID() ||
+        getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
+        vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
+        !equal(L->operands(), R->operands()))
+      return false;
+    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
+      if (LFlags->hasPredicate() &&
+          LFlags->getPredicate() !=
+              cast<VPRecipeWithIRFlags>(R)->getPredicate())
+        return false;
+    const VPlan *Plan = L->getParent()->getPlan();
+    VPTypeAnalysis TypeInfo(*Plan);
+    return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
+  }
+};
+} // end anonymous namespace
+
+/// Perform common-subexpression elimination on \p Plan: uses of a recipe are
+/// redirected to an equivalent, earlier-visited recipe in a dominating block.
+void VPlanTransforms::cse(VPlan &Plan) {
+  VPDominatorTree VPDT(Plan);
+  DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
+        continue;
+      if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
+        // V must dominate Def for a valid replacement.
+        if (!VPDT.dominates(V->getParent(), VPBB))
+          continue;
+        // Only keep flags present on both V and Def.
+        if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
+          RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
+        Def->replaceAllUsesWith(V);
+        continue;
+      }
+      CSEMap[Def] = Def;
+    }
+  }
+}
+
 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1953,10 +2229,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(removeRedundantInductionCasts, Plan);
 
   runPass(simplifyRecipes, Plan);
-  runPass(simplifyBlends, Plan);
   runPass(removeDeadRecipes, Plan);
-  runPass(narrowToSingleScalarRecipes, Plan);
+  runPass(simplifyBlends, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
+  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan);
   runPass(removeBranchOnConst, Plan);
@@ -2042,13 +2318,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");
 
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
-  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+  auto *LaneMaskPhi =
+      new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
 
   // Create the active lane mask for the next iteration of the loop before the
@@ -2059,8 +2338,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2077,12 +2356,10 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
 /// for the header-mask pattern manually.
 static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
   SmallVector<VPValue *> WideCanonicalIVs;
-  auto *FoundWidenCanonicalIVUser =
-      find_if(Plan.getCanonicalIV()->users(),
-              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+                                            IsaPred<VPWidenCanonicalIVRecipe>);
   assert(count_if(Plan.getCanonicalIV()->users(),
-                  [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
-             1 &&
+                  IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
          "Must have at most one VPWideCanonicalIVRecipe");
   if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
     auto *WideCanonicalIV =
@@ -2125,9 +2402,8 @@ void VPlanTransforms::addActiveLaneMask(
          "DataAndControlFlowWithoutRuntimeCheck implies "
          "UseActiveLaneMaskForControlFlow");
 
-  auto *FoundWidenCanonicalIVUser =
-      find_if(Plan.getCanonicalIV()->users(),
-              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+  auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
+                                            IsaPred<VPWidenCanonicalIVRecipe>);
   assert(FoundWidenCanonicalIVUser &&
          "Must have widened canonical IV when tail folding!");
   VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
@@ -2139,9 +2415,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace the header mask of the form
@@ -2205,6 +2484,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
         VPValue *NewAddr = GetNewAddr(S->getAddr());
         return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
       })
+      .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+        VPValue *NewMask = GetNewMask(IR->getMask());
+        return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
+      })
       .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
         VPValue *NewMask = GetNewMask(Red->getCondOp());
         return new VPReductionEVLRecipe(*Red, EVL, NewMask);
@@ -2271,11 +2554,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     VPBuilder Builder(LoopRegion->getPreheaderVPBB());
     MaxEVL = Builder.createScalarZExtOrTrunc(
         MaxEVL, Type::getInt32Ty(Plan.getContext()),
-        TypeInfo.inferScalarType(MaxEVL), DebugLoc());
+        TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
 
     Builder.setInsertPoint(Header, Header->getFirstNonPhi());
-    VPValue *PrevEVL =
-        Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
+    VPValue *PrevEVL = Builder.createScalarPhi(
+        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
 
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
              vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
@@ -2327,16 +2610,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     if (!EVLRecipe)
       continue;
 
-    [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
+    unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
     assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
            "New recipe must define the same number of values as the "
            "original.");
-    assert(NumDefVal <= 1 &&
-           "Only supports recipes with a single definition or without users.");
     EVLRecipe->insertBefore(CurRecipe);
-    if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
-      VPValue *CurVPV = CurRecipe->getVPSingleValue();
-      CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+    if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
+            EVLRecipe)) {
+      for (unsigned I = 0; I < NumDefVal; ++I) {
+        VPValue *CurVPV = CurRecipe->getVPValue(I);
+        CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
+      }
     }
     ToErase.push_back(CurRecipe);
   }
@@ -2404,7 +2688,7 @@ void VPlanTransforms::addExplicitVectorLength(
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
-  auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+  auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
   EVLPhi->insertAfter(CanonicalIVPHI);
   VPBuilder Builder(Header, Header->getFirstNonPhi());
   // Create the AVL (application vector length), starting from TC -> 0 in steps
@@ -2418,10 +2702,11 @@ void VPlanTransforms::addExplicitVectorLength(
     VPValue *AVLSafe =
         Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
     VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
-    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
+    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
+                               "safe_avl");
   }
   auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
-                                     DebugLoc());
+                                     DebugLoc::getUnknown());
 
   auto *CanonicalIVIncrement =
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
@@ -2473,6 +2758,22 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
 
   VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
   VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+  VPValue *AVL;
+  [[maybe_unused]] bool FoundAVL =
+      match(EVLIncrement,
+            m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
+  assert(FoundAVL && "Didn't find AVL?");
+
+  // The AVL may be capped to a safe distance.
+  VPValue *SafeAVL;
+  if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
+    AVL = SafeAVL;
+
+  VPValue *AVLNext;
+  [[maybe_unused]] bool FoundAVLNext =
+      match(AVL, m_VPInstruction<Instruction::PHI>(
+                     m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
+  assert(FoundAVLNext && "Didn't find AVL backedge?");
 
   // Convert EVLPhi to concrete recipe.
   auto *ScalarR =
@@ -2496,7 +2797,7 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
 
   // Replace the use of VectorTripCount in the latch-exiting block.
   // Before: (branch-on-count EVLIVInc, VectorTripCount)
-  // After: (branch-on-count EVLIVInc, TripCount)
+  // After: (branch-on-cond eq AVLNext, 0)
 
   VPBasicBlock *LatchExiting =
       HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
@@ -2509,7 +2810,54 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
                m_BranchOnCount(m_VPValue(EVLIncrement),
                                m_Specific(&Plan.getVectorTripCount()))) &&
          "Unexpected terminator in EVL loop");
-  LatchExitingBr->setOperand(1, Plan.getTripCount());
+
+  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
+  VPBuilder Builder(LatchExitingBr);
+  VPValue *Cmp =
+      Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+                         Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+  Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
+  LatchExitingBr->eraseFromParent();
+}
+
+void VPlanTransforms::replaceSymbolicStrides(
+    VPlan &Plan, PredicatedScalarEvolution &PSE,
+    const DenseMap<Value *, const SCEV *> &StridesMap) {
+  // Replace VPValues for known constant strides guaranteed by predicated scalar
+  // evolution.
+  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
+    auto *R = cast<VPRecipeBase>(&U);
+    return R->getParent()->getParent() ||
+           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
+  };
+  for (const SCEV *Stride : StridesMap.values()) {
+    using namespace SCEVPatternMatch;
+    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
+    const APInt *StrideConst;
+    if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
+      // Only handle constant strides for now.
+      continue;
+
+    auto *CI =
+        Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+    if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
+      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+
+    // The versioned value may not be used in the loop directly but through a
+    // sext/zext. Add new live-ins in those cases.
+    for (Value *U : StrideV->users()) {
+      if (!isa<SExtInst, ZExtInst>(U))
+        continue;
+      VPValue *StrideVPV = Plan.getLiveIn(U);
+      if (!StrideVPV)
+        continue;
+      unsigned BW = U->getType()->getScalarSizeInBits();
+      APInt C =
+          isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
+      VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
+    }
+  }
 }
 
 void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -2785,8 +3133,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
   VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
 
   Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
-  Init =
-      Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
+  Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
+                              DebugLoc::getUnknown(), "induction");
 
   // Create the widened phi of the vector IV.
   auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
@@ -2983,9 +3331,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
     R->eraseFromParent();
 }
 
-void VPlanTransforms::handleUncountableEarlyExit(
-    VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
-    VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
+void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
+                                                 VPBasicBlock *EarlyExitVPBB,
+                                                 VPlan &Plan,
+                                                 VPBasicBlock *HeaderVPBB,
+                                                 VPBasicBlock *LatchVPBB) {
   VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
   if (!EarlyExitVPBB->getSinglePredecessor() &&
       EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -3038,13 +3388,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
     }
 
     VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
-    auto IsVector = [](ElementCount VF) { return VF.isVector(); };
-    // When the VFs are vectors, need to add `extract` to get the incoming value
-    // from early exit. When the range contains scalar VF, limit the range to
-    // scalar VF to prevent mis-compilation for the range containing both scalar
-    // and vector VFs.
-    if (!IncomingFromEarlyExit->isLiveIn() &&
-        LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
+    if (!IncomingFromEarlyExit->isLiveIn()) {
       // Update the incoming value from the early exit.
       VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
           VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
@@ -3125,7 +3469,7 @@ static VPExpressionRecipe *
 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                           VPCostContext &Ctx, VFRange &Range) {
   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
-  if (Opcode != Instruction::Add)
+  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
     return nullptr;
 
   Type *RedTy = Ctx.Types.inferScalarType(Red);
@@ -3140,8 +3484,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
           Type *SrcTy =
               Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
-          InstructionCost MulAccCost =
-              Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+          InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+              isZExt, Opcode, RedTy, SrcVecTy, CostKind);
           InstructionCost MulCost = Mul->computeCost(VF, Ctx);
           InstructionCost RedCost = Red->computeCost(VF, Ctx);
           InstructionCost ExtCost = 0;
@@ -3506,6 +3850,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
       Plan.resetTripCount(Exp);
     ExpSCEV->eraseFromParent();
   }
+  assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
+         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
+         "after any VPIRInstructions");
+  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
+  // to the VPIRBasicBlock.
+  auto EI = Entry->begin();
+  for (Instruction &I : drop_end(*EntryBB)) {
+    if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
+        &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
+      EI++;
+      continue;
+    }
+    VPIRInstruction::create(I)->insertBefore(*Entry, EI);
+  }
+
   return ExpandedSCEVs;
 }
 
@@ -3574,12 +3933,12 @@ static bool isAlreadyNarrow(VPValue *VPV) {
 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                              unsigned VectorRegWidth) {
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
-  if (VF.isScalable() || !VectorLoop)
+  if (!VectorLoop)
     return;
 
   VPTypeAnalysis TypeInfo(Plan);
 
-  unsigned FixedVF = VF.getFixedValue();
+  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) ||
@@ -3615,7 +3974,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       continue;
 
     // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
+    if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
                                       VectorRegWidth))
       return;
 
@@ -3672,9 +4031,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     return;
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
-  auto NarrowOp = [](VPValue *V) -> VPValue * {
+  SmallPtrSet<VPValue *, 4> NarrowedOps;
+  auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
     auto *R = V->getDefiningRecipe();
-    if (!R)
+    if (!R || NarrowedOps.contains(V))
       return V;
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
@@ -3684,6 +4044,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
           LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
           /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
       L->insertBefore(LoadGroup);
+      NarrowedOps.insert(L);
       return L;
     }
 
@@ -3691,6 +4052,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       assert(RepR->isSingleScalar() &&
              isa<LoadInst>(RepR->getUnderlyingInstr()) &&
              "must be a single scalar load");
+      NarrowedOps.insert(RepR);
       return RepR;
     }
     auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3704,6 +4066,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                     /*IsUniform*/ true,
                                     /*Mask*/ nullptr, *WideLoad);
     N->insertBefore(WideLoad);
+    NarrowedOps.insert(N);
     return N;
   };
 
@@ -3734,10 +4097,21 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   // original iteration.
   auto *CanIV = Plan.getCanonicalIV();
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
-  Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
-                         CanIV->getScalarType(), 1 * Plan.getUF())));
-  Plan.getVF().replaceAllUsesWith(
-      Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+  VPBuilder PHBuilder(Plan.getVectorPreheader());
+
+  VPValue *UF = Plan.getOrAddLiveIn(
+      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+  if (VF.isScalable()) {
+    VPValue *VScale = PHBuilder.createElementCount(
+        CanIV->getScalarType(), ElementCount::getScalable(1));
+    VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
+    Inc->setOperand(1, VScaleUF);
+    Plan.getVF().replaceAllUsesWith(VScale);
+  } else {
+    Inc->setOperand(1, UF);
+    Plan.getVF().replaceAllUsesWith(
+        Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+  }
   removeDeadRecipes(Plan);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 700b94621d5f..1957428fab79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -62,16 +62,47 @@ struct VPlanTransforms {
   /// The created loop is wrapped in an initial skeleton to facilitate
   /// vectorization, consisting of a vector pre-header, an exit block for the
   /// main vector loop (middle.block) and a new block as preheader of the scalar
-  /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p
-  /// InductionTy and \p IVDL, and creates a VPValue expression for the original
-  /// trip count.
+  /// loop (scalar.ph). See below for an illustration. It also adds a canonical
+  /// IV and its increment, using \p InductionTy and \p IVDL, and creates a
+  /// VPValue expression for the original trip count.
+  ///
+  ///    [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
+  ///    / \       old preheader. Will contain iteration number check and SCEV
+  ///   |   |      expansions.
+  ///   |   |
+  ///   /   v
+  ///  |   [ ] <-- vector loop bypass (may consist of multiple blocks) will be
+  ///  |  / |      added later.
+  ///  | /  v
+  ///  ||  [ ]     <-- vector pre header.
+  ///  |/   |
+  ///  |    v
+  ///  |   [  ] \  <-- plain CFG loop wrapping original loop to be vectorized.
+  ///  |   [  ]_|
+  ///  |    |
+  ///  |    v
+  ///  |   [ ]   <--- middle-block with the branch to successors
+  ///  |   / |
+  ///  |  /  |
+  ///  | |   v
+  ///  \--->[ ]   <--- scalar preheader (initially a VPBasicBlock, which will be
+  ///    |   |        replaced later by a VPIRBasicBlock wrapping the scalar
+  ///    |   |        preheader basic block).
+  ///    |   |
+  ///        v      <-- edge from middle to exit iff epilogue is not required.
+  ///    |  [ ] \
+  ///    |  [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue,
+  ///    |   |          header wrapped in VPIRBasicBlock).
+  ///    \   |
+  ///     \  v
+  ///      >[ ]     <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
   LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
   buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
               PredicatedScalarEvolution &PSE);
 
   /// Update \p Plan to account for all early exits.
-  LLVM_ABI_FOR_TEST static void
-  handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
+  LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
+                                                 bool HasUncountableExit);
 
   /// If a check is needed to guard executing the scalar epilogue loop, it will
   /// be added to the middle block.
@@ -79,6 +110,13 @@ struct VPlanTransforms {
                                                bool RequiresScalarEpilogueCheck,
                                                bool TailFolded);
 
+  /// Add a check to \p Plan to see if the vector loop should be executed.
+  static void addMinimumIterationCheck(
+      VPlan &Plan, ElementCount VF, unsigned UF,
+      ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+      bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+      const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);
+
   /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
   /// flat CFG into a hierarchical CFG.
   LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);
@@ -161,6 +199,12 @@ struct VPlanTransforms {
   truncateToMinimalBitwidths(VPlan &Plan,
                              const MapVector<Instruction *, uint64_t> &MinBWs);
 
+  /// Replace symbolic strides from \p StridesMap in \p Plan with constants when
+  /// possible.
+  static void
+  replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE,
+                         const DenseMap<Value *, const SCEV *> &StridesMap);
+
   /// Drop poison flags from recipes that may generate a poison value that is
   /// used after vectorization, even when their operands are not poison. Those
   /// recipes meet the following conditions:
@@ -207,8 +251,7 @@ struct VPlanTransforms {
   static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
                                          VPBasicBlock *EarlyExitVPBB,
                                          VPlan &Plan, VPBasicBlock *HeaderVPBB,
-                                         VPBasicBlock *LatchVPBB,
-                                         VFRange &Range);
+                                         VPBasicBlock *LatchVPBB);
 
   /// Replace loop regions with explicit CFG.
   static void dissolveLoopRegions(VPlan &Plan);
@@ -220,9 +263,10 @@ struct VPlanTransforms {
   /// variable vector lengths instead of fixed lengths. This transformation:
   ///  * Makes EVL-Phi concrete.
   //   * Removes CanonicalIV and increment.
-  ///  * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
-  ///    VectorTripCount) with variable-length stepping (branch-on-cond
-  ///    EVLIVInc, TripCount).
+  ///  * Replaces the exit condition from
+  ///      (branch-on-count CanonicalIVInc, VectorTripCount)
+  ///    to
+  ///      (branch-on-cond eq AVLNext, 0)
   static void canonicalizeEVLLoops(VPlan &Plan);
 
   /// Lower abstract recipes to concrete ones, that can be codegen'd.
@@ -242,6 +286,9 @@ struct VPlanTransforms {
   /// removing dead edges to their successors.
   static void removeBranchOnConst(VPlan &Plan);
 
+  /// Perform common-subexpression-elimination on \p Plan.
+  static void cse(VPlan &Plan);
+
   /// If there's a single exit block, optimize its phi recipes that use exiting
   /// IV values by feeding them precomputed end values instead, possibly taken
   /// one step backwards.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 4bcde8cd5d42..443df167378b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -92,18 +92,18 @@ public:
   void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                         unsigned Part) {
     for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
-      auto Ins = VPV2Parts.insert({VPV, {}});
-      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
-      Ins.first->second.push_back(CopyR->getVPValue(Idx));
+      const auto &[V, _] = VPV2Parts.try_emplace(VPV);
+      assert(V->second.size() == Part - 1 && "earlier parts not set");
+      V->second.push_back(CopyR->getVPValue(Idx));
     }
   }
 
   /// Given a uniform recipe \p R, add it for all parts.
   void addUniformForAllParts(VPSingleDefRecipe *R) {
-    auto Ins = VPV2Parts.insert({R, {}});
-    assert(Ins.second && "uniform value already added");
+    const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
+    assert(Inserted && "uniform value already added");
     for (unsigned Part = 0; Part != UF; ++Part)
-      Ins.first->second.push_back(R);
+      V->second.push_back(R);
   }
 
   bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
@@ -536,16 +536,9 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
 
       VPBuilder Builder(RepR);
       if (RepR->getNumUsers() == 0) {
-        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
-            vputils::isSingleScalar(RepR->getOperand(1))) {
-          // Stores to invariant addresses need to store the last lane only.
-          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
-                       Def2LaneDefs);
-        } else {
-          // Create single-scalar version of RepR for all lanes.
-          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
-        }
+        // Create single-scalar version of RepR for all lanes.
+        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
         RepR->eraseFromParent();
         continue;
       }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 700a733bf9f2..c6c1ef336982 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   VPValue *A, *B;
   using namespace VPlanPatternMatch;
 
-  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
     return B == Plan.getTripCount() &&
            (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
                                      m_SpecificInt(1),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d..77c099b27171 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,8 @@ inline bool isSingleScalar(const VPValue *VPV) {
     case Instruction::GetElementPtr:
     case Instruction::ICmp:
     case Instruction::FCmp:
+    case Instruction::Select:
+    case VPInstruction::Not:
     case VPInstruction::Broadcast:
     case VPInstruction::PtrAdd:
       return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef..85c6c2c8d796 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -38,7 +38,7 @@ struct VPDoubleValueDef;
 class VPSlotTracker;
 class VPUser;
 class VPRecipeBase;
-class VPInterleaveRecipe;
+class VPInterleaveBase;
 class VPPhiAccessors;
 
 // This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -48,7 +48,7 @@ class VPPhiAccessors;
 class LLVM_ABI_FOR_TEST VPValue {
   friend class VPDef;
   friend struct VPDoubleValueDef;
-  friend class VPInterleaveRecipe;
+  friend class VPInterleaveBase;
   friend class VPlan;
   friend class VPExpressionRecipe;
 
@@ -335,6 +335,7 @@ public:
     VPExpressionSC,
     VPIRInstructionSC,
     VPInstructionSC,
+    VPInterleaveEVLSC,
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418..99f3bc367a54 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           }
           return VerifyEVLUse(*R, 2);
         })
-        .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+        .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
+              VPInterleaveEVLRecipe>(
             [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
         .Case<VPInstructionWithType>(
             [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
@@ -412,7 +413,7 @@ bool VPlanVerifier::verifyRegion(const VPRegionBlock *Region) {
   const VPBlockBase *Exiting = Region->getExiting();
 
   // Entry and Exiting shouldn't have any predecessor/successor, respectively.
-  if (Entry->getNumPredecessors() != 0) {
+  if (Entry->hasPredecessors()) {
     errs() << "region entry block has predecessors\n";
     return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 092a3a87954f..17cb18a22336 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -99,6 +99,10 @@ private:
 
   InstructionWorklist Worklist;
 
+  /// Next instruction the run loop will visit. If that instruction is erased
+  /// by RecursivelyDeleteTriviallyDeadInstructions, this advances past it.
+  Instruction *NextInst;
+
   // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
   //       parameter. That should be updated to specific sub-classes because the
   //       run loop was changed to dispatch on opcode.
@@ -118,6 +122,7 @@ private:
   bool foldInsExtBinop(Instruction &I);
   bool foldInsExtVectorToShuffle(Instruction &I);
   bool foldBitOpOfCastops(Instruction &I);
+  bool foldBitOpOfCastConstant(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
   bool scalarizeOpOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
@@ -169,13 +174,16 @@ private:
     // further folds that were hindered by OneUse limits.
     SmallPtrSet<Value *, 4> Visited;
     for (Value *Op : Ops) {
-      if (Visited.insert(Op).second) {
+      if (!Visited.contains(Op)) {
         if (auto *OpI = dyn_cast<Instruction>(Op)) {
           if (RecursivelyDeleteTriviallyDeadInstructions(
-                  OpI, nullptr, nullptr, [this](Value *V) {
-                    if (auto I = dyn_cast<Instruction>(V)) {
+                  OpI, nullptr, nullptr, [&](Value *V) {
+                    if (auto *I = dyn_cast<Instruction>(V)) {
                       LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
                       Worklist.remove(I);
+                      if (I == NextInst)
+                        NextInst = NextInst->getNextNode();
+                      Visited.insert(I);
                     }
                   }))
             continue;
@@ -862,14 +870,17 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
   if (LHSSrc->getType() != RHSSrc->getType())
     return false;
 
-  // Only handle vector types with integer elements
-  auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
-  auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
-  if (!SrcVecTy || !DstVecTy)
+  auto *SrcTy = LHSSrc->getType();
+  auto *DstTy = I.getType();
+  // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
+  // Other casts only handle vector types with integer elements.
+  if (CastOpcode != Instruction::BitCast &&
+      (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
     return false;
 
-  if (!SrcVecTy->getScalarType()->isIntegerTy() ||
-      !DstVecTy->getScalarType()->isIntegerTy())
+  // Only integer scalar/vector values are legal for bitwise logic operations.
+  if (!SrcTy->getScalarType()->isIntegerTy() ||
+      !DstTy->getScalarType()->isIntegerTy())
     return false;
 
   // Cost Check :
@@ -877,23 +888,21 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
   // NewCost = bitlogic + cast
 
   // Calculate specific costs for each cast with instruction context
-  InstructionCost LHSCastCost =
-      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
-                           TTI::CastContextHint::None, CostKind, LHSCast);
-  InstructionCost RHSCastCost =
-      TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
-                           TTI::CastContextHint::None, CostKind, RHSCast);
+  InstructionCost LHSCastCost = TTI.getCastInstrCost(
+      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+  InstructionCost RHSCastCost = TTI.getCastInstrCost(
+      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
 
   InstructionCost OldCost =
-      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
+      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
       LHSCastCost + RHSCastCost;
 
   // For new cost, we can't provide an instruction (it doesn't exist yet)
   InstructionCost GenericCastCost = TTI.getCastInstrCost(
-      CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
+      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
 
   InstructionCost NewCost =
-      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
+      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
       GenericCastCost;
 
   // Account for multi-use casts using specific costs
@@ -930,6 +939,102 @@ bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
   return true;
 }
 
+/// Fold a bitwise logic op of a bitcast and a constant into the source type:
+///   bitop(castop(x), C) -> bitop(castop(x), castop(InvC))
+///                       -> castop(bitop(x, InvC))
+/// Only bitcast is supported. Returns true if \p I was replaced; returns false
+/// when the pattern, the types, InvC, or the cost comparison rule the fold out.
+bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
+  Instruction *LHS;
+  Constant *C;
+
+  // Match a bitwise logic op whose operands are an instruction and a constant.
+  if (!match(&I, m_c_BitwiseLogic(m_Instruction(LHS), m_Constant(C))))
+    return false;
+
+  // The non-constant operand must be a cast instruction.
+  auto *LHSCast = dyn_cast<CastInst>(LHS);
+  if (!LHSCast)
+    return false;
+
+  Instruction::CastOps CastOpcode = LHSCast->getOpcode();
+
+  // Only bitcast is supported; reject every other cast opcode.
+  switch (CastOpcode) {
+  case Instruction::BitCast:
+    break;
+  default:
+    return false;
+  }
+
+  Value *LHSSrc = LHSCast->getOperand(0);
+
+  auto *SrcTy = LHSSrc->getType();
+  auto *DstTy = I.getType();
+  // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>. Other
+  // casts need fixed vector types (unreachable while only bitcast is handled).
+  if (CastOpcode != Instruction::BitCast &&
+      (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
+    return false;
+
+  // Only integer scalar/vector values are legal for bitwise logic operations.
+  if (!SrcTy->getScalarType()->isIntegerTy() ||
+      !DstTy->getScalarType()->isIntegerTy())
+    return false;
+
+  // Find the constant InvC such that castop(InvC) equals C; bail if none.
+  PreservedCastFlags RHSFlags;
+  Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
+  if (!InvC)
+    return false;
+
+  // Cost check:
+  // OldCost = bitlogic (on DstTy) + cast
+  // NewCost = bitlogic (on SrcTy) + cast (+ old cast if it has other uses)
+
+  // Cost of the existing cast, using the instruction itself as context.
+  InstructionCost LHSCastCost = TTI.getCastInstrCost(
+      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
+
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
+
+  // For new cost, we can't provide an instruction (it doesn't exist yet).
+  InstructionCost GenericCastCost = TTI.getCastInstrCost(
+      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
+
+  InstructionCost NewCost =
+      TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
+      GenericCastCost;
+
+  // The original cast remains if it has other users, so add its cost as well.
+  if (!LHSCast->hasOneUse())
+    NewCost += LHSCastCost;
+
+  LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
+                    << " NewCost=" << NewCost << "\n");
+
+  if (NewCost > OldCost)
+    return false;
+
+  // Create the bitwise op on the source type, copying I's IR flags onto it.
+  Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
+                                     LHSSrc, InvC, I.getName() + ".inner");
+  if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
+    NewBinOp->copyIRFlags(&I);
+
+  Worklist.pushValue(NewOp);
+
+  // Create the cast operation directly to ensure we get a new instruction.
+  Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
+
+  // Insert the new cast through the builder.
+  Value *Result = Builder.Insert(NewCast);
+
+  replaceValue(I, *Result);
+  return true;
+}
+
 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the
 /// destination type followed by shuffle. This can enable further transforms by
 /// moving bitcasts or shuffles together.
@@ -1461,8 +1566,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
                              TTI::CastContextHint::None, CostKind, RedOp);
 
     CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
-    CostAfterReduction =
-        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    CostAfterReduction = TTI.getMulAccReductionCost(
+        IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
     return;
   }
   CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
@@ -3753,6 +3858,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   unsigned MaxVectorSize =
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
   unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+  if (MaxElementsInVector == 0)
+    return false;
   // When there are multiple shufflevector operations on the same input,
   // especially when the vector length is larger than the register size,
   // identical shuffle patterns may occur across different groups of elements.
@@ -4467,6 +4574,8 @@ bool VectorCombine::run() {
       case Instruction::Xor:
         if (foldBitOpOfCastops(I))
           return true;
+        if (foldBitOpOfCastConstant(I))
+          return true;
         break;
       case Instruction::PHI:
         if (shrinkPhiOfShuffles(I))
@@ -4519,13 +4628,21 @@ bool VectorCombine::run() {
     if (!DT.isReachableFromEntry(&BB))
       continue;
     // Use early increment range so that we can erase instructions in loop.
-    for (Instruction &I : make_early_inc_range(BB)) {
-      if (I.isDebugOrPseudoInst())
-        continue;
-      MadeChange |= FoldInst(I);
+    // make_early_inc_range is not applicable here, as the next iterator may
+    // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
+    // We manually maintain the next instruction and update it when it is about
+    // to be deleted.
+    Instruction *I = &BB.front();
+    while (I) {
+      NextInst = I->getNextNode();
+      if (!I->isDebugOrPseudoInst())
+        MadeChange |= FoldInst(*I);
+      I = NextInst;
     }
   }
 
+  NextInst = nullptr;
+
   while (!Worklist.isEmpty()) {
     Instruction *I = Worklist.removeOne();
     if (!I)