[𝘀𝗽𝗿] changes introduced through rebaseusers/boomanaiden154/main.githubci-upload-ninja_log-as-an-artifact

Created using spr 1.3.4 [skip ci]
author: Aiden Grossman <aidengrossman@google.com> 2025-04-14 07:38:01 +0000
committer: Aiden Grossman <aidengrossman@google.com> 2025-04-14 07:38:01 +0000
commit: 0b43a0423bbaa22384d522050a295eb564116d95 (patch)
tree: 18b4111bcb0563e9f7279666299318617dacc017
parent: b7163d6a2490a7517cb8a526e36a877a4fd7bede (diff)
parent: 97bc9137e545423334b00d60ab64855ccc434c3a (diff)
223 files changed, 4484 insertions, 1996 deletions
diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt
index 6f10afe4a562..a1e9da41b4b3 100644
--- a/clang-tools-extra/clangd/CMakeLists.txt
+++ b/clang-tools-extra/clangd/CMakeLists.txt
@@ -210,6 +210,9 @@ if (CLANGD_ENABLE_REMOTE)
   include(AddGRPC)
 endif()
 
+option(CLANGD_BUILD_DEXP "Build the dexp tool as part of Clangd" ON)
+llvm_canonicalize_cmake_booleans(CLANGD_BUILD_DEXP)
+
 if(CLANG_INCLUDE_TESTS)
   add_subdirectory(test)
   add_subdirectory(unittests)
@@ -220,4 +223,7 @@ option(CLANGD_ENABLE_REMOTE "Use gRPC library to enable remote index support for
 set(GRPC_INSTALL_PATH "" CACHE PATH "Path to gRPC library manual installation.")
 
 add_subdirectory(index/remote)
-add_subdirectory(index/dex/dexp)
+
+if(CLANGD_BUILD_DEXP)
+  add_subdirectory(index/dex/dexp)
+endif()
diff --git a/clang-tools-extra/clangd/test/CMakeLists.txt b/clang-tools-extra/clangd/test/CMakeLists.txt
index b51f461a4986..42fc3506641f 100644
--- a/clang-tools-extra/clangd/test/CMakeLists.txt
+++ b/clang-tools-extra/clangd/test/CMakeLists.txt
@@ -3,8 +3,6 @@ set(CLANGD_TEST_DEPS
   ClangdTests
   clangd-indexer
   split-file
-  # No tests for it, but we should still make sure they build.
-  dexp
   )
 
 if(CLANGD_BUILD_XPC)
@@ -12,6 +10,11 @@ if(CLANGD_BUILD_XPC)
   list(APPEND CLANGD_TEST_DEPS ClangdXpcUnitTests)
 endif()
 
+if(CLANGD_BUILD_DEXP)
+  # No tests for it, but we should still make sure they build.
+  list(APPEND CLANGD_TEST_DEPS dexp)
+endif()
+
 if(CLANGD_ENABLE_REMOTE)
   list(APPEND CLANGD_TEST_DEPS clangd-index-server clangd-index-server-monitor)
 endif()
diff --git a/clang-tools-extra/clangd/test/lit.site.cfg.py.in b/clang-tools-extra/clangd/test/lit.site.cfg.py.in
index 1fe7c8d0f324..a0bb3561e19e 100644
--- a/clang-tools-extra/clangd/test/lit.site.cfg.py.in
+++ b/clang-tools-extra/clangd/test/lit.site.cfg.py.in
@@ -15,6 +15,7 @@ config.llvm_shlib_dir = "@SHLIBDIR@"
 config.clangd_source_dir = "@CMAKE_CURRENT_SOURCE_DIR@/.."
 config.clangd_binary_dir = "@CMAKE_CURRENT_BINARY_DIR@/.."
 config.clangd_build_xpc = @CLANGD_BUILD_XPC@
+config.clangd_build_dexp = @CLANGD_BUILD_DEXP@
 config.clangd_enable_remote = @CLANGD_ENABLE_REMOTE@
 config.clangd_tidy_checks = @CLANGD_TIDY_CHECKS@
 config.have_zlib = @LLVM_ENABLE_ZLIB@
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3b8a9cac6587..971ab50cc9a6 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -818,7 +818,23 @@ of different sizes and signs is forbidden in binary and ternary builtins.
  T __builtin_elementwise_fmod(T x, T y)         return The floating-point remainder of (x/y) whose sign                floating point types
                                                 matches the sign of x.
  T __builtin_elementwise_max(T x, T y)          return x or y, whichever is larger                                     integer and floating point types
+                                                For floating point types, follows semantics of maxNum
+                                                in IEEE 754-2008. See `LangRef
+                                                <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
+                                                for the comparison.
  T __builtin_elementwise_min(T x, T y)          return x or y, whichever is smaller                                    integer and floating point types
+                                                For floating point types, follows semantics of minNum
+                                                in IEEE 754-2008. See `LangRef
+                                                <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
+                                                for the comparison.
+ T __builtin_elementwise_maxnum(T x, T y)       return x or y, whichever is larger. Follows IEEE 754-2008              floating point types
+                                                semantics (maxNum) with +0.0>-0.0. See `LangRef
+                                                <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
+                                                for the comparison.
+ T __builtin_elementwise_minnum(T x, T y)       return x or y, whichever is smaller. Follows IEEE 754-2008             floating point types
+                                                semantics (minNum) with +0.0>-0.0. See `LangRef
+                                                <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
+                                                for the comparison.
  T __builtin_elementwise_add_sat(T x, T y)      return the sum of x and y, clamped to the range of                     integer types
                                                 representable values for the signed/unsigned integer type.
  T __builtin_elementwise_sub_sat(T x, T y)      return the difference of x and y, clamped to the range of              integer types
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 11f62bc881b0..fd9b9a80e993 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -38,6 +38,9 @@ Potentially Breaking Changes
 - Fix missing diagnostics for uses of declarations when performing typename access,
   such as when performing member access on a '[[deprecated]]' type alias.
   (#GH58547)
+- For ARM targets when compiling assembly files, the features included in the selected CPU
+  or Architecture's FPU are included. If you wish not to use a specific feature,
+  the relevant ``+no`` option will need to be amended to the command line option.
 
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
@@ -191,6 +194,7 @@ Non-comprehensive list of changes in this release
 - Support parsing the `cc` operand modifier and alias it to the `c` modifier (#GH127719).
 - Added `__builtin_elementwise_exp10`.
 - For AMDPGU targets, added `__builtin_v_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction.
+- Added `__builtin_elementwise_minnum` and `__builtin_elementwise_maxnum`.
 
 New Compiler Flags
 ------------------
@@ -511,6 +515,7 @@ X86 Support
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
+- For ARM targets, cc1as now considers the FPU's features for the selected CPU or Architecture.
 
 Android Support
 ^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 868e5b92acdc..74a11257b373 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1304,6 +1304,18 @@ def ElementwiseMin : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwiseMaxNum : Builtin {
+  let Spellings = ["__builtin_elementwise_maxnum"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
+def ElementwiseMinNum : Builtin {
+  let Spellings = ["__builtin_elementwise_minnum"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ElementwiseMaximum : Builtin {
   let Spellings = ["__builtin_elementwise_maximum"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c1020b234b13..affc076a876a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6881,6 +6881,13 @@ let Flags = [TargetSpecific] in {
 defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group<f_Group>;
 } // let Flags = [TargetSpecific]
 
+def shared_libflangrt : Flag<["-"], "shared-libflangrt">,
+  HelpText<"Link the flang-rt shared library">, Group<Link_Group>,
+  Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>;
+def static_libflangrt : Flag<["-"], "static-libflangrt">, 
+  HelpText<"Link the flang-rt static library">, Group<Link_Group>,
+  Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>;
+
 //===----------------------------------------------------------------------===//
 // FLangOption + NoXarchOption
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 076e4296c309..d0059673d6a6 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -521,6 +521,10 @@ public:
   addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args,
                                llvm::opt::ArgStringList &CmdArgs) const;
 
+  /// Add the path for libflang_rt.runtime.a
+  void addFlangRTLibPath(const llvm::opt::ArgList &Args,
+                         llvm::opt::ArgStringList &CmdArgs) const;
+
   const char *getCompilerRTArgString(const llvm::opt::ArgList &Args,
                                      StringRef Component,
                                      FileType Type = ToolChain::FT_Static,
diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h
index 8f2a4f54a1b7..00a79a0fcb5d 100644
--- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h
+++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Bitset.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include <cstdint>
@@ -376,6 +377,8 @@ enum PolicyScheme : uint8_t {
   HasPolicyOperand,
 };
 
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, enum PolicyScheme PS);
+
 // TODO refactor RVVIntrinsic class design after support all intrinsic
 // combination. This represents an instantiation of an intrinsic with a
 // particular type and prototype
@@ -507,6 +510,23 @@ enum RVVRequire {
   RVV_REQ_NUM,
 };
 
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, enum RVVRequire Require);
+
+struct RequiredExtensionBits {
+  llvm::Bitset<RVV_REQ_NUM> Bits;
+  RequiredExtensionBits() {}
+  RequiredExtensionBits(std::initializer_list<RVVRequire> Init) {
+    for (auto I : Init)
+      Bits.set(I);
+  }
+
+  void set(unsigned I) { Bits.set(I); }
+  bool operator[](unsigned I) const { return Bits[I]; }
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+                              const RequiredExtensionBits &Exts);
+
 // Raw RVV intrinsic info, used to expand later.
 // This struct is highly compact for minimized code size.
 struct RVVIntrinsicRecord {
@@ -518,7 +538,7 @@ struct RVVIntrinsicRecord {
   const char *OverloadedName;
 
   // Required target features for this intrinsic.
-  uint32_t RequiredExtensions[(RVV_REQ_NUM + 31) / 32];
+  RequiredExtensionBits RequiredExtensions;
 
   // Prototype for this intrinsic, index of RVVSignatureTable.
   uint16_t PrototypeIndex;
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 12c434029562..d4c9ce6050b8 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -33,39 +33,74 @@
 using namespace clang;
 using namespace clang::interp;
 
-template <typename T> inline static T ReadArg(Program &P, CodePtr &OpPC) {
+template <typename T>
+inline static std::string printArg(Program &P, CodePtr &OpPC) {
   if constexpr (std::is_pointer_v<T>) {
     uint32_t ID = OpPC.read<uint32_t>();
-    return reinterpret_cast<T>(P.getNativePointer(ID));
+    std::string Result;
+    llvm::raw_string_ostream SS(Result);
+    SS << reinterpret_cast<T>(P.getNativePointer(ID));
+    return Result;
   } else {
-    return OpPC.read<T>();
+    std::string Result;
+    llvm::raw_string_ostream SS(Result);
+    auto Arg = OpPC.read<T>();
+    SS << Arg;
+    return Result;
   }
 }
 
-template <> inline Floating ReadArg<Floating>(Program &P, CodePtr &OpPC) {
-  Floating F = Floating::deserialize(*OpPC);
+template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
+  auto F = Floating::deserialize(*OpPC);
   OpPC += align(F.bytesToSerialize());
-  return F;
+
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <>
-inline IntegralAP<false> ReadArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
-}
+inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
+  auto F = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
+}
 template <>
-inline IntegralAP<true> ReadArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
+  auto F = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
+
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
-template <> inline FixedPoint ReadArg<FixedPoint>(Program &P, CodePtr &OpPC) {
-  FixedPoint I = FixedPoint::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
+  auto F = FixedPoint::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
+
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
+}
+
+static bool isJumpOpcode(Opcode Op) {
+  return Op == OP_Jmp || Op == OP_Jf || Op == OP_Jt;
+}
+
+static size_t getNumDisplayWidth(size_t N) {
+  unsigned L = 1u, M = 10u;
+  while (M <= N && ++L != std::numeric_limits<size_t>::digits10 + 1)
+    M *= 10u;
+
+  return L;
 }
 
 LLVM_DUMP_METHOD void Function::dump() const { dump(llvm::errs()); }
@@ -80,23 +115,115 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const {
   OS << "rvo:        " << hasRVO() << "\n";
   OS << "this arg:   " << hasThisPointer() << "\n";
 
-  auto PrintName = [&OS](const char *Name) {
-    OS << Name;
-    long N = 30 - strlen(Name);
-    if (N > 0)
-      OS.indent(N);
+  struct OpText {
+    size_t Addr;
+    std::string Op;
+    bool IsJump;
+    llvm::SmallVector<std::string> Args;
   };
 
+  auto PrintName = [](const char *Name) -> std::string {
+    return std::string(Name);
+  };
+
+  llvm::SmallVector<OpText> Code;
+  size_t LongestAddr = 0;
+  size_t LongestOp = 0;
+
   for (CodePtr Start = getCodeBegin(), PC = Start; PC != getCodeEnd();) {
     size_t Addr = PC - Start;
+    OpText Text;
     auto Op = PC.read<Opcode>();
-    OS << llvm::format("%8d", Addr) << " ";
+    Text.Addr = Addr;
+    Text.IsJump = isJumpOpcode(Op);
     switch (Op) {
 #define GET_DISASM
 #include "Opcodes.inc"
 #undef GET_DISASM
     }
+    Code.push_back(Text);
+    LongestOp = std::max(Text.Op.size(), LongestOp);
+    LongestAddr = std::max(getNumDisplayWidth(Addr), LongestAddr);
   }
+
+  // Record jumps and their targets.
+  struct JmpData {
+    size_t From;
+    size_t To;
+  };
+  llvm::SmallVector<JmpData> Jumps;
+  for (auto &Text : Code) {
+    if (Text.IsJump)
+      Jumps.push_back({Text.Addr, Text.Addr + std::stoi(Text.Args[0]) +
+                                      align(sizeof(Opcode)) +
+                                      align(sizeof(int32_t))});
+  }
+
+  llvm::SmallVector<std::string> Text;
+  Text.reserve(Code.size());
+  size_t LongestLine = 0;
+  // Print code to a string, one at a time.
+  for (auto C : Code) {
+    std::string Line;
+    llvm::raw_string_ostream LS(Line);
+    LS << C.Addr;
+    LS.indent(LongestAddr - getNumDisplayWidth(C.Addr) + 4);
+    LS << C.Op;
+    LS.indent(LongestOp - C.Op.size() + 4);
+    for (auto &Arg : C.Args) {
+      LS << Arg << ' ';
+    }
+    Text.push_back(Line);
+    LongestLine = std::max(Line.size(), LongestLine);
+  }
+
+  assert(Code.size() == Text.size());
+
+  auto spaces = [](unsigned N) -> std::string {
+    std::string S;
+    for (unsigned I = 0; I != N; ++I)
+      S += ' ';
+    return S;
+  };
+
+  // Now, draw the jump lines.
+  for (auto &J : Jumps) {
+    if (J.To > J.From) {
+      bool FoundStart = false;
+      for (size_t LineIndex = 0; LineIndex != Text.size(); ++LineIndex) {
+        Text[LineIndex] += spaces(LongestLine - Text[LineIndex].size());
+
+        if (Code[LineIndex].Addr == J.From) {
+          Text[LineIndex] += "  --+";
+          FoundStart = true;
+        } else if (Code[LineIndex].Addr == J.To) {
+          Text[LineIndex] += "  <-+";
+          break;
+        } else if (FoundStart) {
+          Text[LineIndex] += "    |";
+        }
+      }
+      LongestLine += 5;
+    } else {
+      bool FoundStart = false;
+      for (ssize_t LineIndex = Text.size() - 1; LineIndex >= 0; --LineIndex) {
+        Text[LineIndex] += spaces(LongestLine - Text[LineIndex].size());
+        if (Code[LineIndex].Addr == J.From) {
+          Text[LineIndex] += "  --+";
+          FoundStart = true;
+        } else if (Code[LineIndex].Addr == J.To) {
+          Text[LineIndex] += "  <-+";
+          break;
+        } else if (FoundStart) {
+          Text[LineIndex] += "    |";
+        }
+      }
+      LongestLine += 5;
+    }
+  }
+
+  for (auto &Line : Text)
+    OS << Line << '\n';
 }
 
 LLVM_DUMP_METHOD void Program::dump() const { dump(llvm::errs()); }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 0afd772c73b8..3e1f36da8925 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -307,7 +307,7 @@ bool isConstexprUnknown(const Pointer &P) {
   if (P.isDummy())
     return false;
   const VarDecl *VD = P.block()->getDescriptor()->asVarDecl();
-  return VD && VD->hasLocalStorage();
+  return VD && VD->hasLocalStorage() && !isa<ParmVarDecl>(VD);
 }
 
 bool CheckBCPResult(InterpState &S, const Pointer &Ptr) {
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 4e84dcc8d551..b4e15b3ffbe6 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -771,6 +771,11 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                   bool CanOverflow) {
   assert(!Ptr.isDummy());
 
+  if (!S.inConstantContext()) {
+    if (isConstexprUnknown(Ptr))
+      return false;
+  }
+
   if constexpr (std::is_same_v<T, Boolean>) {
     if (!S.getLangOpts().CPlusPlus14)
       return Invalid(S, OpPC);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index fe55dfffc1cb..1e4e055e04af 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3818,6 +3818,22 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     return RValue::get(Result);
   }
 
+  case Builtin::BI__builtin_elementwise_maxnum: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Result = Builder.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, Op0,
+                                                  Op1, nullptr, "elt.maxnum");
+    return RValue::get(Result);
+  }
+
+  case Builtin::BI__builtin_elementwise_minnum: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    Value *Result = Builder.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, Op0,
+                                                  Op1, nullptr, "elt.minnum");
+    return RValue::get(Result);
+  }
+
   case Builtin::BI__builtin_elementwise_maximum: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index db2a2c574064..bcf039d9f268 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -198,14 +198,10 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
         /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
   }
 
-  // FIXME: Should also use this for OpenCL, but it requires addressing the
-  // problem of kernels being called.
-  //
   // FIXME: This doesn't apply the optimization of coercing pointers in structs
   // to global address space when using byref. This would require implementing a
   // new kind of coercion of the in-memory type when for indirect arguments.
-  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
-      isAggregateTypeForABI(Ty)) {
+  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
     return ABIArgInfo::getIndirectAliased(
         getContext().getTypeAlignInChars(Ty),
         getContext().getTargetAddressSpace(LangAS::opencl_constant),
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 36d0ae34dec8..97317579c8a5 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -744,9 +744,12 @@ std::string ToolChain::buildCompilerRTBasename(const llvm::opt::ArgList &Args,
     Suffix = IsITANMSVCWindows ? ".lib" : ".a";
     break;
   case ToolChain::FT_Shared:
-    Suffix = TT.isOSWindows()
-                 ? (TT.isWindowsGNUEnvironment() ? ".dll.a" : ".lib")
-                 : ".so";
+    if (TT.isOSWindows())
+      Suffix = TT.isWindowsGNUEnvironment() ? ".dll.a" : ".lib";
+    else if (TT.isOSAIX())
+      Suffix = ".a";
+    else
+      Suffix = ".so";
     break;
   }
 
@@ -816,8 +819,7 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
       if (AsNeeded)
         addAsNeededOption(*this, Args, CmdArgs, /*as_needed=*/false);
     }
-    CmdArgs.push_back("-lflang_rt.runtime");
-    addArchSpecificRPath(*this, Args, CmdArgs);
+    addFlangRTLibPath(Args, CmdArgs);
 
     // needs libexecinfo for backtrace functions
     if (getTriple().isOSFreeBSD() || getTriple().isOSNetBSD() ||
@@ -850,6 +852,20 @@ void ToolChain::addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args,
     CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
 }
 
+void ToolChain::addFlangRTLibPath(const ArgList &Args,
+                                  llvm::opt::ArgStringList &CmdArgs) const {
+  // Link static flang_rt.runtime.a or shared flang_rt.runtime.so.
+  // On AIX, default to static flang-rt.
+  if (Args.hasFlag(options::OPT_static_libflangrt,
+                   options::OPT_shared_libflangrt, getTriple().isOSAIX()))
+    CmdArgs.push_back(
+        getCompilerRTArgString(Args, "runtime", ToolChain::FT_Static, true));
+  else {
+    CmdArgs.push_back("-lflang_rt.runtime");
+    addArchSpecificRPath(*this, Args, CmdArgs);
+  }
+}
+
 // Android target triples contain a target version. If we don't have libraries
 // for the exact target version, we should fall back to the next newest version
 // or a versionless path, if any.
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 26b9d4c772be..5dc80bc5a3d2 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -608,14 +608,6 @@ void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args,
   ToolChain::addProfileRTLibs(Args, CmdArgs);
 }
 
-void AIX::addFortranRuntimeLibs(const ArgList &Args,
-                                llvm::opt::ArgStringList &CmdArgs) const {
-  // Link flang_rt.runtime.a. On AIX, the static and shared library are all
-  // named .a
-  CmdArgs.push_back(
-      getCompilerRTArgString(Args, "runtime", ToolChain::FT_Static, true));
-}
-
 ToolChain::CXXStdlibType AIX::GetDefaultCXXStdlibType() const {
   return ToolChain::CST_Libcxx;
 }
diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h
index 17e8370cd121..8f130f6b5454 100644
--- a/clang/lib/Driver/ToolChains/AIX.h
+++ b/clang/lib/Driver/ToolChains/AIX.h
@@ -87,9 +87,6 @@ public:
   void addProfileRTLibs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const override;
 
-  void addFortranRuntimeLibs(const llvm::opt::ArgList &Args,
-                             llvm::opt::ArgStringList &CmdArgs) const override;
-
   CXXStdlibType GetDefaultCXXStdlibType() const override;
 
   RuntimeLibType GetDefaultRuntimeLibType() const override;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index d1de3b91c352..35ca019795dd 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -65,6 +65,10 @@ void RocmInstallationDetector::scanLibDevicePath(llvm::StringRef Path) {
       FiniteOnly.Off = FilePath;
     } else if (BaseName == "oclc_finite_only_on") {
       FiniteOnly.On = FilePath;
+    } else if (BaseName == "oclc_daz_opt_on") {
+      DenormalsAreZero.On = FilePath;
+    } else if (BaseName == "oclc_daz_opt_off") {
+      DenormalsAreZero.Off = FilePath;
     } else if (BaseName == "oclc_correctly_rounded_sqrt_on") {
       CorrectlyRoundedSqrt.On = FilePath;
     } else if (BaseName == "oclc_correctly_rounded_sqrt_off") {
@@ -881,6 +885,10 @@ void ROCMToolChain::addClangTargetOptions(
     return;
 
   bool Wave64 = isWave64(DriverArgs, Kind);
+  // TODO: There are way too many flags that change this. Do we need to check
+  // them all?
+  bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
+             getDefaultDenormsAreZeroForTarget(Kind);
   bool FiniteOnly = DriverArgs.hasArg(options::OPT_cl_finite_math_only);
 
   bool UnsafeMathOpt =
@@ -901,7 +909,7 @@ void ROCMToolChain::addClangTargetOptions(
 
   // Add the generic set of libraries.
   BCLibs.append(RocmInstallation->getCommonBitcodeLibs(
-      DriverArgs, LibDeviceFile, Wave64, FiniteOnly, UnsafeMathOpt,
+      DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt,
       FastRelaxedMath, CorrectSqrt, ABIVer, GPUSan, false));
 
   for (auto [BCFile, Internalize] : BCLibs) {
@@ -940,8 +948,9 @@ bool RocmInstallationDetector::checkCommonBitcodeLibs(
 llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
 RocmInstallationDetector::getCommonBitcodeLibs(
     const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile, bool Wave64,
-    bool FiniteOnly, bool UnsafeMathOpt, bool FastRelaxedMath, bool CorrectSqrt,
-    DeviceLibABIVersion ABIVer, bool GPUSan, bool isOpenMP) const {
+    bool DAZ, bool FiniteOnly, bool UnsafeMathOpt, bool FastRelaxedMath,
+    bool CorrectSqrt, DeviceLibABIVersion ABIVer, bool GPUSan,
+    bool isOpenMP) const {
   llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> BCLibs;
 
   auto AddBCLib = [&](ToolChain::BitCodeLibraryInfo BCLib,
@@ -960,6 +969,7 @@ RocmInstallationDetector::getCommonBitcodeLibs(
     AddBCLib(getOCKLPath());
   else if (GPUSan && isOpenMP)
     AddBCLib(getOCKLPath(), false);
+  AddBCLib(getDenormalsAreZeroPath(DAZ));
   AddBCLib(getUnsafeMathPath(UnsafeMathOpt || FastRelaxedMath));
   AddBCLib(getFiniteOnlyPath(FiniteOnly || FastRelaxedMath));
   AddBCLib(getCorrectlyRoundedSqrtPath(CorrectSqrt));
@@ -987,6 +997,11 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
     return {};
 
   // If --hip-device-lib is not set, add the default bitcode libraries.
+  // TODO: There are way too many flags that change this. Do we need to check
+  // them all?
+  bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,
+                                options::OPT_fno_gpu_flush_denormals_to_zero,
+                                getDefaultDenormsAreZeroForTarget(Kind));
   bool FiniteOnly = DriverArgs.hasFlag(
       options::OPT_ffinite_math_only, options::OPT_fno_finite_math_only, false);
   bool UnsafeMathOpt =
@@ -1006,7 +1021,7 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs,
                 getSanitizerArgs(DriverArgs).needsAsanRt();
 
   return RocmInstallation->getCommonBitcodeLibs(
-      DriverArgs, LibDeviceFile, Wave64, FiniteOnly, UnsafeMathOpt,
+      DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt,
       FastRelaxedMath, CorrectSqrt, ABIVer, GPUSan, isOpenMP);
 }
 
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index e50cb3836f2c..5084058b3fef 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -679,21 +679,17 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D,
         CPUArgFPUKind != llvm::ARM::FK_INVALID ? CPUArgFPUKind : ArchArgFPUKind;
     (void)llvm::ARM::getFPUFeatures(FPUKind, Features);
   } else {
-    bool Generic = true;
-    if (!ForAS) {
-      std::string CPU = arm::getARMTargetCPU(CPUName, ArchName, Triple);
-      if (CPU != "generic")
-        Generic = false;
-      llvm::ARM::ArchKind ArchKind =
-          arm::getLLVMArchKindForARM(CPU, ArchName, Triple);
-      FPUKind = llvm::ARM::getDefaultFPU(CPU, ArchKind);
-      (void)llvm::ARM::getFPUFeatures(FPUKind, Features);
-    }
+    std::string CPU = arm::getARMTargetCPU(CPUName, ArchName, Triple);
+    bool Generic = CPU == "generic";
     if (Generic && (Triple.isOSWindows() || Triple.isOSDarwin()) &&
         getARMSubArchVersionNumber(Triple) >= 7) {
       FPUKind = llvm::ARM::parseFPU("neon");
-      (void)llvm::ARM::getFPUFeatures(FPUKind, Features);
+    } else {
+      llvm::ARM::ArchKind ArchKind =
+          arm::getLLVMArchKindForARM(CPU, ArchName, Triple);
+      FPUKind = llvm::ARM::getDefaultFPU(CPU, ArchKind);
     }
+    (void)llvm::ARM::getFPUFeatures(FPUKind, Features);
   }
 
   // Now we've finished accumulating features from arch, cpu and fpu,
diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp
index 575e88c6ab12..0ed0f91ad166 100644
--- a/clang/lib/Driver/ToolChains/PPCLinux.cpp
+++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp
@@ -12,7 +12,6 @@
 #include "clang/Driver/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/VirtualFileSystem.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
@@ -102,18 +101,3 @@ bool PPCLinuxToolChain::SupportIEEEFloat128(
   return GlibcSupportsFloat128((Twine(D.DyldPrefix) + Linker).str()) &&
          !(D.CCCIsCXX() && HasUnsupportedCXXLib);
 }
-
-void PPCLinuxToolChain::addFortranRuntimeLibs(
-    const ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
-  // Link static flang_rt.runtime.a or shared flang_rt.runtime.so
-  const char *Path;
-  if (getVFS().exists(Twine(Path = getCompilerRTArgString(
-                                Args, "runtime", ToolChain::FT_Static, true))))
-    CmdArgs.push_back(Path);
-  else if (getVFS().exists(
-               Twine(Path = getCompilerRTArgString(
-                         Args, "runtime", ToolChain::FT_Shared, true))))
-    CmdArgs.push_back(Path);
-  else
-    CmdArgs.push_back("-lflang_rt.runtime");
-}
diff --git a/clang/lib/Driver/ToolChains/PPCLinux.h b/clang/lib/Driver/ToolChains/PPCLinux.h
index 910df3d16e6a..63adaff6be9c 100644
--- a/clang/lib/Driver/ToolChains/PPCLinux.h
+++ b/clang/lib/Driver/ToolChains/PPCLinux.h
@@ -24,9 +24,6 @@ public:
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
 
-  void addFortranRuntimeLibs(const llvm::opt::ArgList &Args,
-                             llvm::opt::ArgStringList &CmdArgs) const override;
-
 private:
   bool SupportIEEEFloat128(const Driver &D, const llvm::Triple &Triple,
                            const llvm::opt::ArgList &Args) const;
diff --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h
index f002b386e11c..2a09da011489 100644
--- a/clang/lib/Driver/ToolChains/ROCm.h
+++ b/clang/lib/Driver/ToolChains/ROCm.h
@@ -137,6 +137,7 @@ private:
   ConditionalLibrary WavefrontSize64;
   ConditionalLibrary FiniteOnly;
   ConditionalLibrary UnsafeMath;
+  ConditionalLibrary DenormalsAreZero;
   ConditionalLibrary CorrectlyRoundedSqrt;
 
   // Maps ABI version to library path. The version number is in the format of
@@ -151,7 +152,8 @@ private:
   bool allGenericLibsValid() const {
     return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() &&
            WavefrontSize64.isValid() && FiniteOnly.isValid() &&
-           UnsafeMath.isValid() && CorrectlyRoundedSqrt.isValid();
+           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
+           CorrectlyRoundedSqrt.isValid();
   }
 
   void scanLibDevicePath(llvm::StringRef Path);
@@ -173,12 +175,11 @@ public:
 
   /// Get file paths of default bitcode libraries common to AMDGPU based
   /// toolchains.
-  llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
-  getCommonBitcodeLibs(const llvm::opt::ArgList &DriverArgs,
-                       StringRef LibDeviceFile, bool Wave64, bool FiniteOnly,
-                       bool UnsafeMathOpt, bool FastRelaxedMath,
-                       bool CorrectSqrt, DeviceLibABIVersion ABIVer,
-                       bool GPUSan, bool isOpenMP) const;
+  llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12> getCommonBitcodeLibs(
+      const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile,
+      bool Wave64, bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
+      bool FastRelaxedMath, bool CorrectSqrt, DeviceLibABIVersion ABIVer,
+      bool GPUSan, bool isOpenMP) const;
   /// Check file paths of default bitcode libraries common to AMDGPU based
   /// toolchains. \returns false if there are invalid or missing files.
   bool checkCommonBitcodeLibs(StringRef GPUArch, StringRef LibDeviceFile,
@@ -244,6 +245,10 @@ public:
     return UnsafeMath.get(Enabled);
   }
 
+  StringRef getDenormalsAreZeroPath(bool Enabled) const {
+    return DenormalsAreZero.get(Enabled);
+  }
+
   StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
     return CorrectlyRoundedSqrt.get(Enabled);
   }
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 82dc403538c4..ef5f07e2c62e 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -5569,7 +5569,8 @@ static bool isAllmanBrace(const FormatToken &Tok) {
 // Returns 'true' if 'Tok' is a function argument.
 static bool IsFunctionArgument(const FormatToken &Tok) {
   return Tok.MatchingParen && Tok.MatchingParen->Next &&
-         Tok.MatchingParen->Next->isOneOf(tok::comma, tok::r_paren);
+         Tok.MatchingParen->Next->isOneOf(tok::comma, tok::r_paren,
+                                          tok::r_brace);
 }
 
 static bool
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index b1e43f0313db..cfaadf07edfd 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -1012,13 +1012,8 @@ void WhitespaceManager::alignConsecutiveDeclarations() {
   AlignTokens(
       Style,
       [&](Change const &C) {
-        if (Style.AlignConsecutiveDeclarations.AlignFunctionPointers) {
-          for (const auto *Prev = C.Tok->Previous; Prev; Prev = Prev->Previous)
-            if (Prev->is(tok::equal))
-              return false;
-          if (C.Tok->is(TT_FunctionTypeLParen))
-            return true;
-        }
+        if (C.Tok->is(TT_FunctionTypeLParen))
+          return Style.AlignConsecutiveDeclarations.AlignFunctionPointers;
         if (C.Tok->is(TT_FunctionDeclarationName))
           return Style.AlignConsecutiveDeclarations.AlignFunctionDeclarations;
         if (C.Tok->isNot(TT_StartOfName))
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index bffd0dd461d3..13bc2bd14621 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2762,6 +2762,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
   // These builtins restrict the element type to floating point
   // types only, and take in two arguments.
+  case Builtin::BI__builtin_elementwise_minnum:
+  case Builtin::BI__builtin_elementwise_maxnum:
   case Builtin::BI__builtin_elementwise_minimum:
   case Builtin::BI__builtin_elementwise_maximum:
   case Builtin::BI__builtin_elementwise_atan2:
diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 746609604d1b..b9f843b1920a 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -232,8 +232,7 @@ void RISCVIntrinsicManagerImpl::ConstructRVVIntrinsics(
   for (auto &Record : Recs) {
     // Check requirements.
     if (llvm::any_of(FeatureCheckList, [&](const auto &Item) {
-          return ((Record.RequiredExtensions[Item.second / 32] &
-                   (1U << (Item.second % 32))) != 0) &&
+          return Record.RequiredExtensions[Item.second] &&
                  !TI.hasFeature(Item.first);
         }))
       continue;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 108d7e1dbaeb..5c80077f294c 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1280,12 +1280,10 @@ void OpenACCDeclClauseInstantiator::VisitDevicePtrClause(
     const OpenACCDevicePtrClause &C) {
   llvm::SmallVector<Expr *> VarList = VisitVarList(C.getVarList());
   // Ensure each var is a pointer type.
-  VarList.erase(std::remove_if(VarList.begin(), VarList.end(),
-                               [&](Expr *E) {
-                                 return SemaRef.OpenACC().CheckVarIsPointerType(
-                                     OpenACCClauseKind::DevicePtr, E);
-                               }),
-                VarList.end());
+  llvm::erase_if(VarList, [&](Expr *E) {
+    return SemaRef.OpenACC().CheckVarIsPointerType(OpenACCClauseKind::DevicePtr,
+                                                   E);
+  });
   ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid);
   if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause,
                                            OpenACCModifierKind::Invalid))
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 21e250e172d5..bb58ec49612c 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -11992,13 +11992,10 @@ void OpenACCClauseTransform<Derived>::VisitDetachClause(
   llvm::SmallVector<Expr *> VarList = VisitVarList(C.getVarList());
 
   // Ensure each var is a pointer type.
-  VarList.erase(
-      std::remove_if(VarList.begin(), VarList.end(),
-                     [&](Expr *E) {
-                       return Self.getSema().OpenACC().CheckVarIsPointerType(
-                           OpenACCClauseKind::Detach, E);
-                     }),
-      VarList.end());
+  llvm::erase_if(VarList, [&](Expr *E) {
+    return Self.getSema().OpenACC().CheckVarIsPointerType(
+        OpenACCClauseKind::Detach, E);
+  });
 
   ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid);
   NewClause = OpenACCDetachClause::Create(
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index a48c05061626..95b5718f1d14 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -1493,42 +1493,45 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, StringRef isysroot) {
     unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
     RecordData::value_type Record[] = {MODULE_NAME};
     Stream.EmitRecordWithBlob(AbbrevCode, Record, WritingModule->Name);
-  }
 
-  if (WritingModule && WritingModule->Directory) {
-    SmallString<128> BaseDir;
-    if (PP.getHeaderSearchInfo().getHeaderSearchOpts().ModuleFileHomeIsCwd) {
-      // Use the current working directory as the base path for all inputs.
-      auto CWD = FileMgr.getOptionalDirectoryRef(".");
-      BaseDir.assign(CWD->getName());
-    } else {
-      BaseDir.assign(WritingModule->Directory->getName());
-    }
-    cleanPathForOutput(FileMgr, BaseDir);
-
-    // If the home of the module is the current working directory, then we
-    // want to pick up the cwd of the build process loading the module, not
-    // our cwd, when we load this module.
-    if (!PP.getHeaderSearchInfo().getHeaderSearchOpts().ModuleFileHomeIsCwd &&
-        (!PP.getHeaderSearchInfo()
-              .getHeaderSearchOpts()
-              .ModuleMapFileHomeIsCwd ||
-         WritingModule->Directory->getName() != ".")) {
-      // Module directory.
-      auto Abbrev = std::make_shared<BitCodeAbbrev>();
-      Abbrev->Add(BitCodeAbbrevOp(MODULE_DIRECTORY));
-      Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Directory
-      unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
+    auto BaseDir = [&]() -> std::optional<SmallString<128>> {
+      if (PP.getHeaderSearchInfo().getHeaderSearchOpts().ModuleFileHomeIsCwd) {
+        // Use the current working directory as the base path for all inputs.
+        auto CWD = FileMgr.getOptionalDirectoryRef(".");
+        return CWD->getName();
+      }
+      if (WritingModule->Directory) {
+        return WritingModule->Directory->getName();
+      }
+      return std::nullopt;
+    }();
+    if (BaseDir) {
+      cleanPathForOutput(FileMgr, *BaseDir);
+
+      // If the home of the module is the current working directory, then we
+      // want to pick up the cwd of the build process loading the module, not
+      // our cwd, when we load this module.
+      if (!PP.getHeaderSearchInfo().getHeaderSearchOpts().ModuleFileHomeIsCwd &&
+          (!PP.getHeaderSearchInfo()
+                .getHeaderSearchOpts()
+                .ModuleMapFileHomeIsCwd ||
+           WritingModule->Directory->getName() != ".")) {
+        // Module directory.
+        auto Abbrev = std::make_shared<BitCodeAbbrev>();
+        Abbrev->Add(BitCodeAbbrevOp(MODULE_DIRECTORY));
+        Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Directory
+        unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
+
+        RecordData::value_type Record[] = {MODULE_DIRECTORY};
+        Stream.EmitRecordWithBlob(AbbrevCode, Record, *BaseDir);
+      }
 
-      RecordData::value_type Record[] = {MODULE_DIRECTORY};
-      Stream.EmitRecordWithBlob(AbbrevCode, Record, BaseDir);
+      // Write out all other paths relative to the base directory if possible.
+      BaseDirectory.assign(BaseDir->begin(), BaseDir->end());
+    } else if (!isysroot.empty()) {
+      // Write out paths relative to the sysroot if possible.
+      BaseDirectory = std::string(isysroot);
     }
-
-    // Write out all other paths relative to the base directory if possible.
-    BaseDirectory.assign(BaseDir.begin(), BaseDir.end());
-  } else if (!isysroot.empty()) {
-    // Write out paths relative to the sysroot if possible.
-    BaseDirectory = std::string(isysroot);
   }
 
   // Module map file
diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
index e44fbb018183..6378596ef31e 100644
--- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp
+++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
@@ -1196,36 +1196,91 @@ SmallVector<PrototypeDescriptor> parsePrototypes(StringRef Prototypes) {
   return PrototypeDescriptors;
 }
 
+#define STRINGIFY(NAME)                                                        \
+  case NAME:                                                                   \
+    OS << #NAME;                                                               \
+    break;
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, enum PolicyScheme PS) {
+  switch (PS) {
+    STRINGIFY(SchemeNone)
+    STRINGIFY(HasPassthruOperand)
+    STRINGIFY(HasPolicyOperand)
+  }
+  return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, enum RVVRequire Require) {
+  switch (Require) {
+    STRINGIFY(RVV_REQ_RV64)
+    STRINGIFY(RVV_REQ_Zvfhmin)
+    STRINGIFY(RVV_REQ_Xsfvcp)
+    STRINGIFY(RVV_REQ_Xsfvfnrclipxfqf)
+    STRINGIFY(RVV_REQ_Xsfvfwmaccqqq)
+    STRINGIFY(RVV_REQ_Xsfvqmaccdod)
+    STRINGIFY(RVV_REQ_Xsfvqmaccqoq)
+    STRINGIFY(RVV_REQ_Zvbb)
+    STRINGIFY(RVV_REQ_Zvbc)
+    STRINGIFY(RVV_REQ_Zvkb)
+    STRINGIFY(RVV_REQ_Zvkg)
+    STRINGIFY(RVV_REQ_Zvkned)
+    STRINGIFY(RVV_REQ_Zvknha)
+    STRINGIFY(RVV_REQ_Zvknhb)
+    STRINGIFY(RVV_REQ_Zvksed)
+    STRINGIFY(RVV_REQ_Zvksh)
+    STRINGIFY(RVV_REQ_Zvfbfwma)
+    STRINGIFY(RVV_REQ_Zvfbfmin)
+    STRINGIFY(RVV_REQ_Zvfh)
+    STRINGIFY(RVV_REQ_Experimental)
+  default:
+    llvm_unreachable("Unsupported RVVRequire!");
+    break;
+  }
+  return OS;
+}
+
+#undef STRINGIFY
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+                              const RequiredExtensionBits &Exts) {
+  OS << "{";
+  ListSeparator LS;
+  for (unsigned I = 0; I < RVV_REQ_NUM; I++)
+    if (Exts[I])
+      OS << LS << static_cast<RVVRequire>(I);
+  OS << "}";
+  return OS;
+}
+
 raw_ostream &operator<<(raw_ostream &OS, const RVVIntrinsicRecord &Record) {
   OS << "{";
-  OS << "\"" << Record.Name << "\",";
+  OS << "/*Name=*/\"" << Record.Name << "\", ";
   if (Record.OverloadedName == nullptr ||
       StringRef(Record.OverloadedName).empty())
-    OS << "nullptr,";
+    OS << "/*OverloadedName=*/nullptr, ";
   else
-    OS << "\"" << Record.OverloadedName << "\",";
-  OS << "{";
-  for (uint32_t Exts : Record.RequiredExtensions)
-    OS << Exts << ',';
-  OS << "},";
-  OS << Record.PrototypeIndex << ",";
-  OS << Record.SuffixIndex << ",";
-  OS << Record.OverloadedSuffixIndex << ",";
-  OS << (int)Record.PrototypeLength << ",";
-  OS << (int)Record.SuffixLength << ",";
-  OS << (int)Record.OverloadedSuffixSize << ",";
-  OS << (int)Record.TypeRangeMask << ",";
-  OS << (int)Record.Log2LMULMask << ",";
-  OS << (int)Record.NF << ",";
-  OS << (int)Record.HasMasked << ",";
-  OS << (int)Record.HasVL << ",";
-  OS << (int)Record.HasMaskedOffOperand << ",";
-  OS << (int)Record.HasTailPolicy << ",";
-  OS << (int)Record.HasMaskPolicy << ",";
-  OS << (int)Record.HasFRMRoundModeOp << ",";
-  OS << (int)Record.IsTuple << ",";
-  OS << (int)Record.UnMaskedPolicyScheme << ",";
-  OS << (int)Record.MaskedPolicyScheme << ",";
+    OS << "/*OverloadedName=*/\"" << Record.OverloadedName << "\", ";
+  OS << "/*RequiredExtensions=*/" << Record.RequiredExtensions << ", ";
+  OS << "/*PrototypeIndex=*/" << Record.PrototypeIndex << ", ";
+  OS << "/*SuffixIndex=*/" << Record.SuffixIndex << ", ";
+  OS << "/*OverloadedSuffixIndex=*/" << Record.OverloadedSuffixIndex << ", ";
+  OS << "/*PrototypeLength=*/" << (int)Record.PrototypeLength << ", ";
+  OS << "/*SuffixLength=*/" << (int)Record.SuffixLength << ", ";
+  OS << "/*OverloadedSuffixSize=*/" << (int)Record.OverloadedSuffixSize << ", ";
+  OS << "/*TypeRangeMask=*/" << (int)Record.TypeRangeMask << ", ";
+  OS << "/*Log2LMULMask=*/" << (int)Record.Log2LMULMask << ", ";
+  OS << "/*NF=*/" << (int)Record.NF << ", ";
+  OS << "/*HasMasked=*/" << (int)Record.HasMasked << ", ";
+  OS << "/*HasVL=*/" << (int)Record.HasVL << ", ";
+  OS << "/*HasMaskedOffOperand=*/" << (int)Record.HasMaskedOffOperand << ", ";
+  OS << "/*HasTailPolicy=*/" << (int)Record.HasTailPolicy << ", ";
+  OS << "/*HasMaskPolicy=*/" << (int)Record.HasMaskPolicy << ", ";
+  OS << "/*HasFRMRoundModeOp=*/" << (int)Record.HasFRMRoundModeOp << ", ";
+  OS << "/*IsTuple=*/" << (int)Record.IsTuple << ", ";
+  OS << "/*UnMaskedPolicyScheme=*/" << (PolicyScheme)Record.UnMaskedPolicyScheme
+     << ", ";
+  OS << "/*MaskedPolicyScheme=*/" << (PolicyScheme)Record.MaskedPolicyScheme
+     << ", ";
   OS << "},\n";
   return OS;
 }
diff --git a/clang/test/AST/ByteCode/codegen.cpp b/clang/test/AST/ByteCode/codegen.cpp
index ea2c812f30f6..7c853a20362b 100644
--- a/clang/test/AST/ByteCode/codegen.cpp
+++ b/clang/test/AST/ByteCode/codegen.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s                                         | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s
 
 #ifdef __SIZEOF_INT128__
@@ -95,3 +95,12 @@ void f(A *a) {
   // CHECK: call void @_ZN1AD1Ev(
   A::E e3 = A().Foo;
 }
+
+int notdead() {
+  auto l = [c=0]() mutable {
+    return  c++ < 5 ? 10 : 12;
+  };
+  return l();
+}
+// CHECK: _ZZ7notdeadvEN3$_0clEv
+// CHECK: ret i32 %cond
diff --git a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
index bbed683ac1fd..c3d0541229fa 100644
--- a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
+++ b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c
@@ -52,9 +52,8 @@ vec2048 x2048 = {0, 1, 2, 3, 3 , 2 , 1, 0, 0, 1, 2, 3, 3 , 2 , 1, 0,
 typedef int8_t vec_int8 __attribute__((vector_size(N / 8)));
 // CHECK128-LABEL: define{{.*}} <16 x i8> @f2(<16 x i8> noundef %x)
 // CHECK128-NEXT:  entry:
-// CHECK128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
 // CHECK128-NEXT:    [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> [[X:%.*]], i64 0)
-// CHECK128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
 // CHECK128-NEXT:    [[CASTFIXEDSVE:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP1]], i64 0)
 // CHECK128-NEXT:    ret <16 x i8> [[CASTFIXEDSVE]]
 
@@ -62,9 +61,8 @@ typedef int8_t vec_int8 __attribute__((vector_size(N / 8)));
 // CHECK-SAME:   ptr dead_on_unwind noalias writable writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 captures(none) initializes((0, [[#div(VBITS,8)]])) %agg.result, ptr noundef readonly captures(none) %0)
 // CHECK-NEXT: entry:
 // CHECK-NEXT:   [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, ptr [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]]
-// CHECK-NEXT:   [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
 // CHECK-NEXT:   [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> poison, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
-// CHECK-NEXT:   [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK-NEXT:   [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
 // CHECK-NEXT:   [[CASTFIXEDSVE:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
 // CHECK-NEXT:   store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, [[TBAA6]]
 // CHECK-NEXT:   ret void
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rdffr.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rdffr.c
index 6bf56bdea505..ca3480d62725 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rdffr.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rdffr.c
@@ -7,14 +7,12 @@
 
 // CHECK-LABEL: @test_svrdffr(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> splat (i1 true))
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z12test_svrdffrv(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> splat (i1 true))
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 svbool_t test_svrdffr()
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index c92aad633082..e5067c1c3b07 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -56,7 +56,7 @@ void f_default2(void) {
 __attribute__((target("avx,      sse4.2,      arch=   ivybridge")))
 void f_avx_sse4_2_ivybridge_2(void) {}
 
-// CHECK: [[f_no_aes_ivybridge]] = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-amx-avx512,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-vaes"
+// CHECK: [[f_no_aes_ivybridge]] = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 __attribute__((target("no-aes, arch=ivybridge")))
 void f_no_aes_ivybridge(void) {}
 
@@ -98,11 +98,11 @@ void f_x86_64_v3(void) {}
 __attribute__((target("arch=x86-64-v4")))
 void f_x86_64_v4(void) {}
 
-// CHECK: [[f_avx10_1_256]] = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,-amx-avx512,-avx10.1-512,-avx10.2-512,-evex512"
+// CHECK: [[f_avx10_1_256]] = {{.*}}"target-cpu"="i686" "target-features"="+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,-amx-avx512,-avx10.1-512,-avx10.2-512,-evex512"
 __attribute__((target("avx10.1-256")))
 void f_avx10_1_256(void) {}
 
-// CHECK: [[f_avx10_1_512]] = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave"
+// CHECK: [[f_avx10_1_512]] = {{.*}}"target-cpu"="i686" "target-features"="+avx,+avx10.1-256,+avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
 __attribute__((target("avx10.1-512")))
 void f_avx10_1_512(void) {}
 
@@ -112,4 +112,4 @@ void f_prefer_256_bit(void) {}
 
 // CHECK: [[f_no_prefer_256_bit]] = {{.*}}"target-features"="{{.*}}-prefer-256-bit
 __attribute__((target("no-prefer-256-bit")))
-void f_no_prefer_256_bit(void) {}
-\ No newline at end of file
+void f_no_prefer_256_bit(void) {}
diff --git a/clang/test/CodeGen/builtin-maxnum-minnum.c b/clang/test/CodeGen/builtin-maxnum-minnum.c
new file mode 100644
index 000000000000..69cec72495d3
--- /dev/null
+++ b/clang/test/CodeGen/builtin-maxnum-minnum.c
@@ -0,0 +1,171 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -x c++ -std=c++20 -disable-llvm-passes -O3 -triple x86_64 %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK
+
+typedef _Float16 half8 __attribute__((ext_vector_type(8)));
+typedef __bf16 bf16x8 __attribute__((ext_vector_type(8)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef long double ldouble2 __attribute__((ext_vector_type(2)));
+
+// CHECK-LABEL: define dso_local noundef <8 x half> @_Z7pfmin16Dv8_DF16_S_(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <8 x half> @llvm.minnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[ELT_MINNUM]]
+//
+half8 pfmin16(half8 a, half8 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <8 x bfloat> @_Z8pfmin16bDv8_DF16bS_(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT:    store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
+// CHECK-NEXT:    ret <8 x bfloat> [[ELT_MINNUM]]
+//
+bf16x8 pfmin16b(bf16x8 a, bf16x8 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <4 x float> @_Z7pfmin32Dv4_fS_(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[ELT_MINNUM]]
+//
+float4 pfmin32(float4 a, float4 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <2 x double> @_Z7pfmin64Dv2_dS_(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    ret <2 x double> [[ELT_MINNUM]]
+//
+double2 pfmin64(double2 a, double2 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <2 x x86_fp80> @_Z7pfmin80Dv2_eS_(
+// CHECK-SAME: ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP0:%.*]], ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
+// CHECK-NEXT:    [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]])
+// CHECK-NEXT:    ret <2 x x86_fp80> [[ELT_MINNUM]]
+//
+ldouble2 pfmin80(ldouble2 a, ldouble2 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+
+// CHECK-LABEL: define dso_local noundef <8 x half> @_Z7pfmax16Dv8_DF16_S_(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MAXNUM:%.*]] = call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[ELT_MAXNUM]]
+//
+half8 pfmax16(half8 a, half8 b) {
+	return __builtin_elementwise_maxnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <8 x bfloat> @_Z8pfmax16bDv8_DF16bS_(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT:    store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MAXNUM:%.*]] = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
+// CHECK-NEXT:    ret <8 x bfloat> [[ELT_MAXNUM]]
+//
+bf16x8 pfmax16b(bf16x8 a, bf16x8 b) {
+	return __builtin_elementwise_maxnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <4 x float> @_Z7pfmax32Dv4_fS_(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MAXNUM:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT:    ret <4 x float> [[ELT_MAXNUM]]
+//
+float4 pfmax32(float4 a, float4 b) {
+	return __builtin_elementwise_maxnum(a, b);
+}
+// CHECK-LABEL: define dso_local noundef <2 x double> @_Z7pfmax64Dv2_dS_(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MAXNUM:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT:    ret <2 x double> [[ELT_MAXNUM]]
+//
+double2 pfmax64(double2 a, double2 b) {
+	return __builtin_elementwise_maxnum(a, b);
+}
+
+// CHECK-LABEL: define dso_local noundef <2 x x86_fp80> @_Z7pfmax80Dv2_eS_(
+// CHECK-SAME: ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP0:%.*]], ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
+// CHECK-NEXT:    [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]])
+// CHECK-NEXT:    ret <2 x x86_fp80> [[ELT_MINNUM]]
+//
+ldouble2 pfmax80(ldouble2 a, ldouble2 b) {
+	return __builtin_elementwise_minnum(a, b);
+}
+
+//.
+// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+// CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+//.
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 789aae7a5c34..49604c6c5e61 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -546,12 +546,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
@@ -596,20 +594,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -630,15 +623,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
@@ -868,15 +856,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -927,21 +913,16 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+// AMDGCN20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -963,18 +944,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN20-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -1408,12 +1384,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
@@ -1458,20 +1432,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-GVAR-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
 //
@@ -1492,15 +1461,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-GVAR-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
@@ -1699,12 +1663,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
@@ -1749,20 +1711,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN30-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN30-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
-// AMDGCN30-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN30-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
+// AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
 //
@@ -1783,15 +1740,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN30-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index 6dc488c40da7..7d0a66bac146 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 --include-generated-funcs 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefix=AMDGCN %s
 
 typedef int int2 __attribute__((ext_vector_type(2)));
@@ -330,15 +330,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -389,21 +387,16 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -425,18 +418,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index 4d09fc3ffb70..06d3cdb01deb 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -214,7 +214,8 @@ typedef struct struct_4regs
   int w;
 } struct_4regs;
 
-// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce)
+// CHECK: void @kernel_empty_struct_arg(ptr addrspace(4) noundef readnone byref(%struct.empty_struct) align 1 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_empty_struct_arg()
 __kernel void kernel_empty_struct_arg(empty_struct s) { }
 
 // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce)
@@ -223,28 +224,35 @@ __kernel void kernel_single_element_struct_arg(single_element_struct_arg_t arg1)
 // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce)
 __kernel void kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t arg1) { }
 
-// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce)
+// CHECK: void @kernel_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_arg(i32 %arg1.coerce0, float %arg1.coerce1, i32 %arg1.coerce2)
 __kernel void kernel_struct_arg(struct_arg_t arg1) { }
 
-// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg %arg1.coerce)
+// CHECK: void @kernel_struct_padding_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_padding_arg) align 8 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_padding_arg(i8 %arg1.coerce0, i64 %arg1.coerce1)
 __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { }
 
-// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg %arg1.coerce)
+// CHECK: void @kernel_test_struct_of_arrays_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_of_arrays_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_test_struct_of_arrays_arg([2 x i32] %arg1.coerce0, float %arg1.coerce1, [4 x i32] %arg1.coerce2, [3 x float] %arg1.coerce3, i32 %arg1.coerce4)
 __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { }
 
-// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce)
+// CHECK: void @kernel_struct_of_structs_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_of_structs_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_of_structs_arg(i32 %arg1.coerce0, float %arg1.coerce1, %struct.struct_arg %arg1.coerce2, i32 %arg1.coerce3)
 __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { }
 
 // CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce)
 __kernel void test_kernel_transparent_union_arg(transparent_u u) { }
 
-// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce)
+// CHECK: void @kernel_single_array_element_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.single_array_element_struct_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_single_array_element_struct_arg([4 x i32] %arg1.coerce)
 __kernel void kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) { }
 
-// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg %arg1.coerce)
+// CHECK: void @kernel_single_struct_element_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.single_struct_element_struct_arg) align 8 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_single_struct_element_struct_arg(%struct.inner %arg1.coerce)
 __kernel void kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t arg1) { }
 
-// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair %arg1.coerce)
+// CHECK: void @kernel_different_size_type_pair_arg(ptr addrspace(4) noundef readonly byref(%struct.different_size_type_pair) align 8 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_different_size_type_pair_arg(i64 %arg1.coerce0, i32 %arg1.coerce1)
 __kernel void kernel_different_size_type_pair_arg(different_size_type_pair arg1) { }
 
 // CHECK: define{{.*}} void @func_f32_arg(float noundef %arg)
diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
index cdbe510b723b..a5b2bee127bd 100644
--- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
+++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
@@ -676,12 +676,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
@@ -698,20 +696,15 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -734,15 +727,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
@@ -815,28 +803,23 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2(
-// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] [[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
+// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[STRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
 // AMDGCN-NEXT:    store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
-// AMDGCN-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP5]], ptr addrspace(1) noundef align 8 [[TMP4]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]]) #[[ATTR5]]
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8
+// AMDGCN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP2]], ptr addrspace(1) noundef align 8 [[TMP1]], <2 x i32> [[TMP4]], <2 x i32> [[TMP6]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -875,19 +858,12 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[LARGESTRUCTONEMEM_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP1]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index 743f4e74696c..1091e6e372ac 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -54,12 +54,12 @@
 // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx803 \
 // RUN:   --no-offloadlib --offloadlib --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s  2>&1 | \
 // RUN: FileCheck %s --check-prefix=CHECK-LIB-DEVICE
-// CHECK-LIB-DEVICE: "-cc1" {{.*}}ocml.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
+// CHECK-LIB-DEVICE: "-cc1" {{.*}}ocml.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
 
 // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx803 -nogpulib \
 // RUN:   --offloadlib --no-offloadlib --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s  2>&1 | \
 // RUN: FileCheck %s --check-prefix=CHECK-LIB-DEVICE-NOGPULIB
-// CHECK-LIB-DEVICE-NOGPULIB-NOT: "-cc1" {{.*}}ocml.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
+// CHECK-LIB-DEVICE-NOGPULIB-NOT: "-cc1" {{.*}}ocml.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
 
 // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \
 // RUN:   -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID
diff --git a/clang/test/Driver/arm-fpu-selection.s b/clang/test/Driver/arm-fpu-selection.s
new file mode 100644
index 000000000000..6af374d04905
--- /dev/null
+++ b/clang/test/Driver/arm-fpu-selection.s
@@ -0,0 +1,36 @@
+// REQUIRES: arm-registered-target
+/// Ensures that when targeting an ARM target with an Asm file, clang
+/// collects the features from the FPU. This is critical in the
+/// activation of NEON for supported targets. The Cortex-R52 will be
+/// used and tested for VFP and NEON Support
+
+// RUN: %clang -target arm-none-eabi -mcpu=cortex-r52 -c %s -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-STDERR %s --allow-empty
+// RUN: %clang -target arm-none-eabi -mcpu=cortex-r52 -c %s -o /dev/null -### 2>&1 | FileCheck --check-prefix=CHECK-TARGET-FEATURES %s
+
+/// Check that no errors or warnings are present when assembling using cc1as.
+// CHECK-STDERR-NOT: error:
+// CHECK-STDERR-NOT: warning:
+
+/// Check that NEON and VFPV5 have been activated when using Cortex-R52 when using cc1as
+// CHECK-TARGET-FEATURES: "-target-feature" "+vfp2sp"
+// CHECK-TARGET-FEATURES: "-target-feature" "+vfp3"
+// CHECK-TARGET-FEATURES: "-target-feature" "+fp-armv8"
+// CHECK-TARGET-FEATURES: "-target-feature" "+fp-armv8d16"
+// CHECK-TARGET-FEATURES: "-target-feature" "+fp-armv8d16sp"
+// CHECK-TARGET-FEATURES: "-target-feature" "+fp-armv8sp"
+// CHECK-TARGET-FEATURES: "-target-feature" "+neon"
+
+  vadd.f32 s0, s1, s2
+  vadd.f64 d0, d1, d2
+  vcvt.u32.f32 s0, s0, #1
+  vcvt.u32.f64 d0, d0, #1
+  vcvtb.f32.f16 s0, s1
+  vcvtb.f64.f16 d0, s1
+  vfma.f32 s0, s1, s2
+  vfma.f64 d0, d1, d2
+  vcvta.u32.f32 s0, s1
+  vcvta.u32.f64 s0, d1
+  vadd.f32 q0, q1, q2
+  vcvt.f32.f16 q0, d1
+  vfma.f32 q0, q1, q2
+  vcvta.u32.f32 q0, q1
diff --git a/clang/test/Driver/armv7-default-neon.s b/clang/test/Driver/armv7-default-neon.s
new file mode 100644
index 000000000000..2015f0bc429d
--- /dev/null
+++ b/clang/test/Driver/armv7-default-neon.s
@@ -0,0 +1,16 @@
+/// Ensure that we can assemble NEON by just specifying an armv7
+/// Apple or Windows target.
+
+// REQUIRES: arm-registered-target
+// RUN: %clang -c -target armv7-apple-darwin -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-STDERR %s --allow-empty
+// RUN: %clang -c -target armv7-apple-darwin -o /dev/null %s -### 2>&1 | FileCheck --check-prefix=CHECK-TARGET-FEATURES %s
+// RUN: %clang -c -target armv7-windows -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-STDERR %s --allow-empty
+// RUN: %clang -c -target armv7-windows -o /dev/null %s -### 2>&1 | FileCheck --check-prefix=CHECK-TARGET-FEATURES %s
+
+/// Check that no errors or warnings are present when assembling using cc1as.
+// CHECK-STDERR-NOT: error:
+// CHECK-STDERR-NOT: warning:
+
+// CHECK-TARGET-FEATURES: "-target-feature" "+neon"
+
+vadd.i32 q0, q0, q0
diff --git a/clang/test/Driver/armv7s-default-vfpv4.s b/clang/test/Driver/armv7s-default-vfpv4.s
new file mode 100644
index 000000000000..3e16503e6316
--- /dev/null
+++ b/clang/test/Driver/armv7s-default-vfpv4.s
@@ -0,0 +1,13 @@
+/// Ensure that we can assemble VFPv4 by just specifying an armv7s target.
+
+// REQUIRES: arm-registered-target
+// RUN: %clang -c -target armv7s-apple-darwin -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-STDERR %s --allow-empty
+// RUN: %clang -c -target armv7s-apple-darwin -o /dev/null %s -### 2>&1 | FileCheck --check-prefix=CHECK-TARGET-FEATURES %s
+
+/// Check that no errors or warnings are present when assembling using cc1as.
+// CHECK-STDERR-NOT: error:
+// CHECK-STDERR-NOT: warning:
+
+// CHECK-TARGET-FEATURES: "-target-feature" "+vfp4"
+
+vfma.f32 q1, q2, q3
diff --git a/clang/test/Driver/armv8.1m.main.s b/clang/test/Driver/armv8.1m.main.s
index 8fc94cf772fa..a660e56c2d1e 100644
--- a/clang/test/Driver/armv8.1m.main.s
+++ b/clang/test/Driver/armv8.1m.main.s
@@ -8,21 +8,21 @@
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+fp -o /dev/null %s 2>%t
 # RUN:      FileCheck --check-prefix=ERROR-V81M_FP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+nofp -o /dev/null %s 2>%t
-# RUN:      FileCheck --check-prefix=ERROR-V81M_FP < %t %s
+# RUN:      FileCheck --check-prefix=ERROR-V81M_NOFP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+fp.dp -o /dev/null %s 2>%t
 # RUN:      FileCheck --check-prefix=ERROR-V81M_FPDP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+nofp.dp -o /dev/null %s 2>%t
-# RUN:      FileCheck --check-prefix=ERROR-V81M_FPDP < %t %s
+# RUN:      FileCheck --check-prefix=ERROR-V81M_NOFPDP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+mve -o /dev/null %s 2>%t
 # RUN:      FileCheck --check-prefix=ERROR-V81M_MVE < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+nomve -o /dev/null %s 2>%t
-# RUN:      FileCheck --check-prefix=ERROR-V81M_MVE < %t %s
+# RUN:      FileCheck --check-prefix=ERROR-V81M_NOMVE < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+mve+fp -o /dev/null %s 2>%t
 # RUN:      FileCheck --check-prefix=ERROR-V81M_MVE_FP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+mve.fp -o /dev/null %s 2>%t
 # RUN:      FileCheck --check-prefix=ERROR-V81M_MVEFP < %t %s
 # RUN: not %clang -c -target arm-none-none-eabi -march=armv8.1-m.main+nomve.fp -o /dev/null %s 2>%t
-# RUN:      FileCheck --check-prefix=ERROR-V81M_MVEFP < %t %s
+# RUN:      FileCheck --check-prefix=ERROR-V81M_NOMVEFP < %t %s
 
 .syntax unified
 .thumb
@@ -35,28 +35,41 @@ qadd     r0, r1, r2
 # ERROR-V8M: :[[@LINE-1]]:1: error
 # ERROR-V81M: :[[@LINE-2]]:1: error
 # ERROR-V81M_FP: :[[@LINE-3]]:1: error
-# ERROR-V81M_FPDP: :[[@LINE-4]]:1: error
+# ERROR-V81M_NOFP: :[[@LINE-4]]:1: error
+# ERROR-V81M_FPDP: :[[@LINE-5]]:1: error
+# ERROR-V81M_NOFPDP: :[[@LINE-6]]:1: error
+# ERROR-V81M_NOMVE: :[[@LINE-7]]:1: error
+# ERROR-V81M_NOMVEFP: :[[@LINE-8]]:1: error
 
 vadd.f16 s0, s1, s2
 # ERROR-V8M: :[[@LINE-1]]:1: error
-# ERROR-V81M: :[[@LINE-2]]:1: error
-# ERROR-V81M_DSP: :[[@LINE-3]]:1: error
-# ERROR-V81M_MVE: :[[@LINE-4]]:1: error
+# ERROR-V81M_NOFP: :[[@LINE-2]]:1: error
 
 vabs.f32 s0, s1
-# ERROR-V8M: :[[@LINE-1]]:1: error
-# ERROR-V81M: :[[@LINE-2]]:1: error
-# ERROR-V81M_DSP: :[[@LINE-3]]:1: error
-# ERROR-V81M_MVE: :[[@LINE-4]]:1: error
+# ERROR-V81M_NOFP: :[[@LINE-1]]:1: error
 
-vcmp.f64 d0,d1
+vabs.s32 q0, q1
 # ERROR-V8M: :[[@LINE-1]]:1: error
 # ERROR-V81M: :[[@LINE-2]]:1: error
 # ERROR-V81M_DSP: :[[@LINE-3]]:1: error
 # ERROR-V81M_FP: :[[@LINE-4]]:1: error
-# ERROR-V81M_MVE: :[[@LINE-5]]:1: error
-# ERROR-V81M_MVE_FP: :[[@LINE-6]]:1: error
-# ERROR-V81M_MVEFP: :[[@LINE-7]]:1: error
+# ERROR-V81M_NOFP: :[[@LINE-5]]:1: error
+# ERROR-V81M_FPDP: :[[@LINE-6]]:1: error
+# ERROR-V81M_NOFPDP: :[[@LINE-7]]:1: error
+# ERROR-V81M_NOMVE: :[[@LINE-8]]:1: error
+# ERROR-V81M_NOMVEFP: :[[@LINE-9]]:1: error
+
+vcmp.f64 d0,d1
+# ERROR-V81M: :[[@LINE-1]]:1: error
+# ERROR-V81M_DSP: :[[@LINE-2]]:1: error
+# ERROR-V81M_FP: :[[@LINE-3]]:1: error
+# ERROR-V81M_NOFP: :[[@LINE-4]]:1: error
+# ERROR-V81M_NOFPDP: :[[@LINE-5]]:1: error
+# ERROR-V81M_MVE: :[[@LINE-6]]:1: error
+# ERROR-V81M_NOMVE: :[[@LINE-7]]:1: error
+# ERROR-V81M_MVE_FP: :[[@LINE-8]]:1: error
+# ERROR-V81M_MVEFP: :[[@LINE-9]]:1: error
+# ERROR-V81M_NOMVEFP: :[[@LINE-10]]:1: error
 
 asrl r0, r1, r2
 # ERROR-V8M: :[[@LINE-1]]:1: error
@@ -64,6 +77,9 @@ asrl r0, r1, r2
 # ERROR-V81M_DSP: :[[@LINE-3]]:1: error
 # ERROR-V81M_FP: :[[@LINE-4]]:1: error
 # ERROR-V81M_FPDP: :[[@LINE-5]]:1: error
+# ERROR-V81M_NOFPDP: :[[@LINE-6]]:1: error
+# ERROR-V81M_NOMVE: :[[@LINE-7]]:1: error
+# ERROR-V81M_NOMVEFP: :[[@LINE-8]]:1: error
 
 vcadd.i8 q0, q1, q2, #90
 # ERROR-V8M: :[[@LINE-1]]:1: error
@@ -71,3 +87,6 @@ vcadd.i8 q0, q1, q2, #90
 # ERROR-V81M_DSP: :[[@LINE-3]]:1: error
 # ERROR-V81M_FP: :[[@LINE-4]]:1: error
 # ERROR-V81M_FPDP: :[[@LINE-5]]:1: error
+# ERROR-V81M_NOFPDP: :[[@LINE-6]]:1: error
+# ERROR-V81M_NOMVE: :[[@LINE-7]]:1: error
+# ERROR-V81M_NOMVEFP: :[[@LINE-8]]:1: error
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index b3829114138c..effce40d67eb 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -6,7 +6,7 @@
 // RUN:  --cuda-gpu-arch=gfx803 \
 // RUN:  --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 
 // Test subtarget with flushing off by ddefault.
@@ -14,7 +14,7 @@
 // RUN:  --cuda-gpu-arch=gfx900 \
 // RUN:  --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD,ROCMDIR
 
 
 // Test explicit flag, opposite of target default.
@@ -23,7 +23,7 @@
 // RUN:   -fgpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 
 // Test explicit flag, opposite of target default.
@@ -32,7 +32,7 @@
 // RUN:   -fno-gpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD,ROCMDIR
 
 
 // Test explicit flag, same as target default.
@@ -41,7 +41,7 @@
 // RUN:   -fno-gpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD,ROCMDIR
 
 
 // Test explicit flag, same as target default.
@@ -50,7 +50,7 @@
 // RUN:   -fgpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 
 // Test last flag wins, not flushing
@@ -59,7 +59,7 @@
 // RUN:   -fgpu-flush-denormals-to-zero -fno-gpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD,ROCMDIR
 
 
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -67,7 +67,7 @@
 // RUN:   -fgpu-flush-denormals-to-zero -fno-gpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD,ROCMDIR
 
 
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -75,7 +75,7 @@
 // RUN:   -fno-gpu-flush-denormals-to-zero -fgpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -83,21 +83,21 @@
 // RUN:   -fno-gpu-flush-denormals-to-zero -fgpu-flush-denormals-to-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 // Test finding device lib in resource dir
 // RUN: %clang -### --target=x86_64-linux-gnu \
 // RUN:   --offload-arch=gfx803 -nogpuinc \
 // RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,RESDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,RESDIR
 
 // Test --hip-device-lib-path flag
 // RUN: %clang -### --target=x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 -nogpuinc \
 // RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 // Test --hip-device-lib-path wins over -resource-dir
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -105,7 +105,7 @@
 // RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode   \
 // RUN:   -resource-dir=%S/Inputs/rocm_resource_dir \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,ROCMDIR
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD,ROCMDIR
 
 // Test environment variable HIP_DEVICE_LIB_PATH
 // RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode \
@@ -213,6 +213,9 @@
 
 // ALL-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]ockl.bc"
 
+// FLUSHD-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_daz_opt_on.bc"
+// NOFLUSHD-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_daz_opt_off.bc"
+
 // ALL-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_unsafe_math_off.bc"
 // ALL-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_finite_only_off.bc"
 // ALL-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_correctly_rounded_sqrt_on.bc"
@@ -220,19 +223,23 @@
 // ALL-SAME: "-mlink-builtin-bitcode" "[[DEVICELIB_DIR]]oclc_isa_version_{{[0-9]+}}.bc"
 // INST-SAME: "-mlink-builtin-bitcode" "{{.*}}instrument.bc"
 
-// FAST: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc"
+// FAST: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc"
+// FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc"
 // FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_on.bc"
 // FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc"
 
-// FINITE: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc"
+// FINITE: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc"
+// FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc"
 // FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_on.bc"
 // FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc"
 
-// UNSAFE: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc"
+// UNSAFE: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc"
+// UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc"
 // UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_off.bc"
 // UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc"
 
-// DIVSQRT: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc"
+// DIVSQRT: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc"
+// DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc"
 // DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_off.bc"
 // DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_off.bc"
 
diff --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
index 7aee10bf1556..f9766e6fa4d9 100644
--- a/clang/test/Driver/rocm-device-libs.cl
+++ b/clang/test/Driver/rocm-device-libs.cl
@@ -6,7 +6,7 @@
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 
 
@@ -15,7 +15,7 @@
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX803,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
 
 
@@ -24,7 +24,7 @@
 // RUN:   -x cl -mcpu=fiji \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX803,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
 
 
@@ -33,7 +33,7 @@
 // RUN:   -cl-denorms-are-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,GFX900,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DAZ,GFX900,WAVE64 %s
 
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
@@ -41,7 +41,7 @@
 // RUN:   -cl-denorms-are-zero \
 // RUN:   --rocm-path=%S/Inputs/rocm \
 // RUN:   %s \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,GFX803,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DAZ,GFX803,WAVE64 %s
 
 
 
@@ -124,13 +124,13 @@
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode \
 // RUN:   %S/opencl.cl \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 // Test environment variable HIP_DEVICE_LIB_PATH
 // RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   %S/opencl.cl \
-// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900,WAVE64 %s
+// RUN: 2>&1 | FileCheck  --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx908:xnack+ -fsanitize=address \
@@ -150,6 +150,11 @@
 // COMMON-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/ocml.bc"
 // COMMON-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/ockl.bc"
 
+// GFX900-DEFAULT-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_daz_opt_off.bc"
+// GFX803-DEFAULT-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_daz_opt_on.bc"
+// GFX700-DEFAULT-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_daz_opt_on.bc"
+// COMMON-DAZ-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_daz_opt_on.bc"
+
 
 // COMMON-DEFAULT-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_unsafe_math_off.bc"
 // COMMON-DEFAULT-SAME: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_finite_only_off.bc"
diff --git a/clang/test/Modules/relocatable-modules.cpp b/clang/test/Modules/relocatable-modules.cpp
new file mode 100644
index 000000000000..c8d1e6d45566
--- /dev/null
+++ b/clang/test/Modules/relocatable-modules.cpp
@@ -0,0 +1,54 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+// RUN: cd %t
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header hu-01.h \
+// RUN:   -fmodule-name=hu-01 -o hu-01.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header hu-02.h \
+// RUN:  -Wno-experimental-header-units -fmodule-file=hu-01.pcm -o hu-02-abs.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header hu-02.h \
+// RUN:  -Wno-experimental-header-units -fmodule-file=hu-01.pcm -o hu-02-rel.pcm \
+// RUN:  -fmodule-file-home-is-cwd
+
+// RUN: %clang -module-file-info hu-02-abs.pcm | FileCheck %s --check-prefix=IMPORT-ABS -DPREFIX=%t
+// IMPORT-ABS: Imports module 'hu-01': [[PREFIX]]{{/|\\}}hu-01.pcm
+
+// RUN: %clang -module-file-info hu-02-rel.pcm | FileCheck %s --check-prefix=IMPORT-REL
+// IMPORT-REL: Imports module 'hu-01': hu-01.pcm
+
+// RUN: llvm-bcanalyzer --dump --disable-histogram %t/hu-02-abs.pcm \
+// RUN:   | FileCheck %s --check-prefix=INPUT-ABS -DPREFIX=%t
+// INPUT-ABS: <INPUT_FILE {{.*}}/> blob data = '[[PREFIX]]{{/|\\}}hu-02.h'
+
+// RUN: llvm-bcanalyzer --dump --disable-histogram %t/hu-02-rel.pcm \
+// RUN:   | FileCheck %s --check-prefix=INPUT-REL
+// INPUT-REL: <INPUT_FILE {{.*}}/> blob data = 'hu-02.h'
+
+//--- hu-01.h
+inline void f() {}
+
+//--- hu-02.h
+import "hu-01.h";
+
+inline void g() {
+  f();
+}
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cppm -o %t/a-abs.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.cppm -o %t/a-rel.pcm \
+// RUN:   -fmodule-file-home-is-cwd
+
+// RUN: llvm-bcanalyzer --dump --disable-histogram %t/a-abs.pcm \
+// RUN:   | FileCheck %s --check-prefix=M-INPUT-ABS -DPREFIX=%t
+// M-INPUT-ABS: <INPUT_FILE {{.*}}/> blob data = '[[PREFIX]]{{/|\\}}a.cppm'
+
+// RUN: llvm-bcanalyzer --dump --disable-histogram %t/a-rel.pcm \
+// RUN:   | FileCheck %s --check-prefix=M-INPUT-REL
+// M-INPUT-REL: <INPUT_FILE {{.*}}/> blob data = 'a.cppm'
+
+//--- a.cppm
+export module a;
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index bf3260c6216d..b62d49e17c83 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -20207,6 +20207,16 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) {
                "double b();",
                AlignmentLeft);
 
+  auto Style = AlignmentLeft;
+  Style.AlignConsecutiveDeclarations.AlignFunctionPointers = true;
+  Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
+  verifyFormat("int function_name(const wchar_t*  title,\n"
+               "                  int             x          = 0,\n"
+               "                  long            extraStyle = 0,\n"
+               "                  bool            readOnly   = false,\n"
+               "                  FancyClassType* module     = nullptr);",
+               Style);
+
   // PAS_Middle
   FormatStyle AlignmentMiddle = Alignment;
   AlignmentMiddle.PointerAlignment = FormatStyle::PAS_Middle;
@@ -20438,7 +20448,7 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) {
                Alignment);
 
   // See PR37175
-  FormatStyle Style = getMozillaStyle();
+  Style = getMozillaStyle();
   Style.AlignConsecutiveDeclarations.Enabled = true;
   verifyFormat("DECOR1 /**/ int8_t /**/ DECOR2 /**/\n"
                "foo(int a);",
@@ -23712,6 +23722,7 @@ TEST_F(FormatTest, FormatsLambdas) {
   verifyFormat("function([]() { return b; })", MergeInline);
   verifyFormat("function([]() { return b; }, a)", MergeInline);
   verifyFormat("function(a, []() { return b; })", MergeInline);
+  verifyFormat("auto guard = foo{[&] { exit_status = true; }};", MergeInline);
 
   // Check option "BraceWrapping.BeforeLambdaBody" and different state of
   // AllowShortLambdasOnASingleLine
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 64534a50877e..5d6d90994cf3 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -171,16 +171,12 @@ void ClangOpcodesEmitter::EmitDisasm(raw_ostream &OS, StringRef N,
   OS << "#ifdef GET_DISASM\n";
   Enumerate(R, N, [R, &OS](ArrayRef<const Record *>, const Twine &ID) {
     OS << "case OP_" << ID << ":\n";
-    OS << "  PrintName(\"" << ID << "\");\n";
-    OS << "  OS << \"\\t\"";
+    OS << "  Text.Op = PrintName(\"" << ID << "\");\n";
+    for (const auto *Arg : R->getValueAsListOfDefs("Args"))
+      OS << "  Text.Args.push_back(printArg<" << Arg->getValueAsString("Name")
+         << ">(P, PC));\n";
 
-    for (const auto *Arg : R->getValueAsListOfDefs("Args")) {
-      OS << " << ReadArg<" << Arg->getValueAsString("Name") << ">(P, PC)";
-      OS << " << \" \"";
-    }
-
-    OS << " << \"\\n\";\n";
-    OS << "  continue;\n";
+    OS << "  break;\n";
   });
   OS << "#endif\n";
 }
diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp
index 02e5e51f6d09..8d94ec3d920d 100644
--- a/clang/utils/TableGen/RISCVVEmitter.cpp
+++ b/clang/utils/TableGen/RISCVVEmitter.cpp
@@ -45,7 +45,7 @@ struct SemaRecord {
   unsigned Log2LMULMask;
 
   // Required extensions for this intrinsic.
-  uint32_t RequiredExtensions[(RVV_REQ_NUM + 31) / 32];
+  RequiredExtensionBits RequiredExtensions;
 
   // Prototype for this intrinsic.
   SmallVector<PrototypeDescriptor> Prototype;
@@ -769,7 +769,6 @@ void RVVEmitter::createRVVIntrinsics(
 
     SR.Log2LMULMask = Log2LMULMask;
 
-    memset(SR.RequiredExtensions, 0, sizeof(SR.RequiredExtensions));
     for (auto RequiredFeature : RequiredFeatures) {
       unsigned RequireExt =
           StringSwitch<RVVRequire>(RequiredFeature)
@@ -793,7 +792,7 @@ void RVVEmitter::createRVVIntrinsics(
               .Case("Zvfbfmin", RVV_REQ_Zvfbfmin)
               .Case("Zvfh", RVV_REQ_Zvfh)
               .Case("Experimental", RVV_REQ_Experimental);
-      SR.RequiredExtensions[RequireExt / 32] |= 1U << (RequireExt % 32);
+      SR.RequiredExtensions.set(RequireExt);
     }
 
     SR.NF = NF;
@@ -837,8 +836,7 @@ void RVVEmitter::createRVVIntrinsicRecords(std::vector<RVVIntrinsicRecord> &Out,
     R.PrototypeLength = SR.Prototype.size();
     R.SuffixLength = SR.Suffix.size();
     R.OverloadedSuffixSize = SR.OverloadedSuffix.size();
-    memcpy(R.RequiredExtensions, SR.RequiredExtensions,
-           sizeof(R.RequiredExtensions));
+    R.RequiredExtensions = SR.RequiredExtensions;
     R.TypeRangeMask = SR.TypeRangeMask;
     R.Log2LMULMask = SR.Log2LMULMask;
     R.NF = SR.NF;
diff --git a/flang/test/Driver/flang-ld-powerpc.f90 b/flang/test/Driver/flang-ld-powerpc.f90
index 9a6ee453a22e..a58cb1629cad 100644
--- a/flang/test/Driver/flang-ld-powerpc.f90
+++ b/flang/test/Driver/flang-ld-powerpc.f90
@@ -7,35 +7,100 @@
 !! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use 
 !! resource_dir_with_per_target_subdir as inputs.
 
-! Check powerpc64-ibm-aix 64-bit linking to static flang-rt
-! RUN: %flang %s -### 2>&1 \
+! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by default
+! RUN: %flang -Werror %s -### 2>&1 \
 ! RUN:        --target=powerpc64-ibm-aix \
 ! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
-! RUN:   | FileCheck %s --check-prefix=AIX64-LD-PER-TARGET
-
-! AIX64-LD-PER-TARGET-NOT: warning:
-! AIX64-LD-PER-TARGET:     "-fc1" "-triple" "powerpc64-ibm-aix"
-! AIX64-LD-PER-TARGET-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-! AIX64-LD-PER-TARGET:     "{{.*}}ld{{(.exe)?}}"
-! AIX64-LD-PER-TARGET-NOT: "-bnso"
-! AIX64-LD-PER-TARGET-SAME:     "-b64"
-! AIX64-LD-PER-TARGET-SAME:     "-bpT:0x100000000" "-bpD:0x110000000"
-! AIX64-LD-PER-TARGET-SAME:     "-lc"
-! AIX64-LD-PER-TARGET-SAME:     "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix{{/|\\\\}}libflang_rt.runtime.a"
-! AIX64-LD-PER-TARGET-SAME:     "-lm"
-! AIX64-LD-PER-TARGET-SAME:     "-lpthread"
-
-! Check powerpc64le-unknown-linux-gnu 64-bit linking to static flang-rt
-! RUN: %flang %s -### 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=AIX64-LD-PER-TARGET-DEFAULT
+
+! AIX64-LD-PER-TARGET-DEFAULT:     "-fc1" "-triple" "powerpc64-ibm-aix"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! AIX64-LD-PER-TARGET-DEFAULT:     "{{.*}}ld{{(.exe)?}}"
+! AIX64-LD-PER-TARGET-DEFAULT-NOT: "-bnso"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-b64"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-bpT:0x100000000" "-bpD:0x110000000"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-lc"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix{{/|\\\\}}libflang_rt.runtime.a"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-lm"
+! AIX64-LD-PER-TARGET-DEFAULT-SAME:     "-lpthread"
+
+
+! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by option 
+! RUN: %flang -static-libflangrt -Werror %s -### 2>&1 \
+! RUN:        --target=powerpc64-ibm-aix \
+! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
+! RUN:   | FileCheck %s --check-prefix=AIX64-LD-PER-TARGET-STATIC
+
+! AIX64-LD-PER-TARGET-STATIC:     "-fc1" "-triple" "powerpc64-ibm-aix"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! AIX64-LD-PER-TARGET-STATIC:     "{{.*}}ld{{(.exe)?}}"
+! AIX64-LD-PER-TARGET-STATIC-NOT: "-bnso"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-b64"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-bpT:0x100000000" "-bpD:0x110000000"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-lc"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix{{/|\\\\}}libflang_rt.runtime.a"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-lm"
+! AIX64-LD-PER-TARGET-STATIC-SAME:     "-lpthread"
+
+
+! Check powerpc64-ibm-aix 64-bit linking to shared flang-rt by option 
+! RUN: %flang -shared-libflangrt -Werror %s -### 2>&1 \
+! RUN:        --target=powerpc64-ibm-aix \
+! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
+! RUN:   | FileCheck %s --check-prefix=AIX64-LD-PER-TARGET-SHARED
+
+! AIX64-LD-PER-TARGET-SHARED:     "-fc1" "-triple" "powerpc64-ibm-aix"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! AIX64-LD-PER-TARGET-SHARED:     "{{.*}}ld{{(.exe)?}}"
+! AIX64-LD-PER-TARGET-SHARED-NOT: "-bnso"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-b64"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-bpT:0x100000000" "-bpD:0x110000000"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-lc"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-lflang_rt.runtime"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-lm"
+! AIX64-LD-PER-TARGET-SHARED-SAME:     "-lpthread"
+
+
+! Check powerpc64le-unknown-linux-gnu 64-bit linking to shared flang-rt by default
+! RUN: %flang -Werror %s -### 2>&1 \
+! RUN:        --target=powerpc64le-unknown-linux-gnu \
+! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
+! RUN:   | FileCheck %s --check-prefixes=LOP64-LD-PER-TARGET-DEFAULT
+
+! LOP64-LD-PER-TARGET-DEFAULT:     "-fc1" "-triple" "powerpc64le-unknown-linux-gnu"
+! LOP64-LD-PER-TARGET-DEFAULT-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! LOP64-LD-PER-TARGET-DEFAULT:     "{{.*}}ld{{(.exe)?}}"
+! LOP64-LD-PER-TARGET-DEFAULT-NOT: "-bnso"
+! LOP64-LD-PER-TARGET-DEFAULT-SAME:     "-lflang_rt.runtime"
+! LOP64-LD-PER-TARGET-DEFAULT-SAME:     "-lm"
+! LOP64-LD-PER-TARGET-DEFAULT-SAME:     "-lc"
+
+
+! Check powerpc64le-unknown-linux-gnu 64-bit linking to static flang-rt by option
+! RUN: %flang -static-libflangrt -Werror %s -### 2>&1 \
 ! RUN:        --target=powerpc64le-unknown-linux-gnu \
 ! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
-! RUN:   | FileCheck %s --check-prefixes=LOP64-LD-PER-TARGET
-
-! LOP64-LD-PER-TARGET-NOT: warning:
-! LOP64-LD-PER-TARGET:     "-fc1" "-triple" "powerpc64le-unknown-linux-gnu"
-! LOP64-LD-PER-TARGET-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-! LOP64-LD-PER-TARGET:     "{{.*}}ld{{(.exe)?}}"
-! LOP64-LD-PER-TARGET-NOT: "-bnso"
-! LOP64-LD-PER-TARGET-SAME:     "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64le-unknown-linux-gnu{{/|\\\\}}libflang_rt.runtime.a"
-! LOP64-LD-PER-TARGET-SAME:     "-lm"
-! LOP64-LD-PER-TARGET-SAME:     "-lc"
+! RUN:   | FileCheck %s --check-prefixes=LOP64-LD-PER-TARGET-STATIC
+
+! LOP64-LD-PER-TARGET-STATIC:     "-fc1" "-triple" "powerpc64le-unknown-linux-gnu"
+! LOP64-LD-PER-TARGET-STATIC-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! LOP64-LD-PER-TARGET-STATIC:     "{{.*}}ld{{(.exe)?}}"
+! LOP64-LD-PER-TARGET-STATIC-NOT: "-bnso"
+! LOP64-LD-PER-TARGET-STATIC-SAME:     "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64le-unknown-linux-gnu{{/|\\\\}}libflang_rt.runtime.a"
+! LOP64-LD-PER-TARGET-STATIC-SAME:     "-lm"
+! LOP64-LD-PER-TARGET-STATIC-SAME:     "-lc"
+
+
+! Check powerpc64le-unknown-linux-gnu 64-bit linking to shared flang-rt by option
+! RUN: %flang -shared-libflangrt -Werror %s -### 2>&1 \
+! RUN:        --target=powerpc64le-unknown-linux-gnu \
+! RUN:        -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
+! RUN:   | FileCheck %s --check-prefixes=LOP64-LD-PER-TARGET-SHARED
+
+! LOP64-LD-PER-TARGET-SHARED:     "-fc1" "-triple" "powerpc64le-unknown-linux-gnu"
+! LOP64-LD-PER-TARGET-SHARED-SAME:     "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+! LOP64-LD-PER-TARGET-SHARED:     "{{.*}}ld{{(.exe)?}}"
+! LOP64-LD-PER-TARGET-SHARED-NOT: "-bnso"
+! LOP64-LD-PER-TARGET-SHARED-SAME:     "-lflang_rt.runtime"
+! LOP64-LD-PER-TARGET-SHARED-SAME:     "-lm"
+! LOP64-LD-PER-TARGET-SHARED-SAME:     "-lc"
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 20104276d2e4..ad48ea1b9e9b 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -2,15 +2,16 @@
 ! invocation. These libraries are added on top of other standard runtime
 ! libraries that the Clang driver will include.
 
-! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128NONE
-! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib
+! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
 ! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,SOLARIS-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib
+! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
+
 ! RUN: %flang -### -rtlib=compiler-rt --target=aarch64-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,COMPILER-RT
 
 ! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows,
@@ -18,6 +19,16 @@
 !       additional dependencies. Make sure its not added.
 ! RUN: %flang -### --target=aarch64-windows-msvc -fuse-ld= %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MSVC --implicit-check-not oldnames
 
+! RUN: %flang -### --target=ppc64le-linux-gnu -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=UNIX-STATIC-FLANGRT
+! RUN: %flang -### --target=sparc-sun-solaris2.11 -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=UNIX-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-unknown-freebsd -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=BSD-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-unknown-netbsd -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=BSD-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-unknown-openbsd -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=BSD-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-unknown-dragonfly -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD-STATIC-FLANGRT
+! RUN: %flang -### --target=aarch64-apple-darwin -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=DARWIN-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-unknown-haiku -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=HAIKU-STATIC-FLANGRT
+! RUN: %flang -### --target=x86_64-windows-gnu -static-libflangrt %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=MINGW-STATIC-FLANGRT
+
 ! Compiler invocation to generate the object file
 ! CHECK-LABEL: {{.*}} "-emit-obj"
 ! CHECK-SAME:  "-o" "[[object_file:.*\.o]]" {{.*}}Inputs/hello.f90
@@ -35,6 +46,7 @@
 ! SOLARIS-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "-z" "ignore" "-lquadmath" "-z" "record"
 ! UNIX-SAME: "-lflang_rt.runtime" "-lm"
 ! COMPILER-RT: "{{.*}}{{\\|/}}libclang_rt.builtins.a"
+! UNIX-STATIC-FLANGRT:   "{{.*}}{{\\|/}}libflang_rt.runtime.a"
 
 ! BSD-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! BSD-SAME: "[[object_file]]"
@@ -42,24 +54,28 @@
 ! BSD-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed"
 ! BSD-SAME: -lflang_rt.runtime
 ! BSD-SAME: -lexecinfo
+! BSD-STATIC-FLANGRT: "{{.*}}{{\\|/}}libflang_rt.runtime.a"
 
 ! DARWIN-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! DARWIN-SAME: "[[object_file]]"
 ! DARWIN-F128NONE-NOT: libflang_rt.quadmath
 ! DARWIN-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed"
 ! DARWIN-SAME: -lflang_rt.runtime
+! DARWIN-STATIC-FLANGRT: "{{.*}}{{\\|/}}libclang_rt.runtime_osx.a"
 
 ! HAIKU-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! HAIKU-SAME: "[[object_file]]"
 ! HAIKU-F128NONE-NOT: libflang_rt.quadmath
 ! HAIKU-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed"
 ! HAIKU-SAME: "-lflang_rt.runtime"
+! HAIKU-STATIC-FLANGRT: "{{.*}}{{\\|/}}libflang_rt.runtime.a"
 
 ! MINGW-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! MINGW-SAME: "[[object_file]]"
 ! MINGW-F128NONE-NOT: libflang_rt.quadmath
 ! MINGW-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed"
 ! MINGW-SAME: -lflang_rt.runtime
+! MINGW-STATIC-FLANGRT: "{{.*}}{{\\|/}}libflang_rt.runtime.a"
 
 ! NOTE: This also matches lld-link (when CLANG_DEFAULT_LINKER=lld) and
 !       any .exe suffix that is added when resolving to the full path of
diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90
index 13d605484506..335bfad4b188 100644
--- a/flang/test/Driver/omp-driver-offload.f90
+++ b/flang/test/Driver/omp-driver-offload.f90
@@ -184,7 +184,7 @@
 ! RUN: %flang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx900 \
 ! RUN:      --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s  2>&1 | \
 ! RUN: FileCheck %s --check-prefix=ROCM-DEVICE-LIB
-! ROCM-DEVICE-LIB: "-fc1" {{.*}}ocml.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_900.bc"
+! ROCM-DEVICE-LIB: "-fc1" {{.*}}ocml.bc"{{.*}}oclc_daz_opt_off.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_900.bc"
 
 ! Test -fopenmp-force-usm option without offload
 ! RUN: %flang -S -### %s -o %t 2>&1 \
diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h
index b1d93caab008..622c8efd7938 100644
--- a/libcxx/include/__bit/popcount.h
+++ b/libcxx/include/__bit/popcount.h
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: __builtin_popcountg is available since Clang 19 and GCC 14. When support for older versions is dropped, we can
-//  refactor this code to exclusively use __builtin_popcountg.
-
 #ifndef _LIBCPP___BIT_POPCOUNT_H
 #define _LIBCPP___BIT_POPCOUNT_H
 
@@ -27,50 +24,10 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned __x) _NOEXCEPT {
-  return __builtin_popcount(__x);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long __x) _NOEXCEPT {
-  return __builtin_popcountl(__x);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long long __x) _NOEXCEPT {
-  return __builtin_popcountll(__x);
-}
-
-template <class _Tp>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount_impl(_Tp __t) _NOEXCEPT {
-  if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned int)) {
-    return std::__libcpp_popcount(static_cast<unsigned int>(__t));
-  } else if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned long)) {
-    return std::__libcpp_popcount(static_cast<unsigned long>(__t));
-  } else if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned long long)) {
-    return std::__libcpp_popcount(static_cast<unsigned long long>(__t));
-  } else {
-#if _LIBCPP_STD_VER == 11
-    return __t != 0 ? std::__libcpp_popcount(static_cast<unsigned long long>(__t)) +
-                          std::__popcount_impl<_Tp>(__t >> numeric_limits<unsigned long long>::digits)
-                    : 0;
-#else
-    int __ret = 0;
-    while (__t != 0) {
-      __ret += std::__libcpp_popcount(static_cast<unsigned long long>(__t));
-      __t >>= std::numeric_limits<unsigned long long>::digits;
-    }
-    return __ret;
-#endif
-  }
-}
-
 template <class _Tp>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount(_Tp __t) _NOEXCEPT {
   static_assert(is_unsigned<_Tp>::value, "__popcount only works with unsigned types");
-#if __has_builtin(__builtin_popcountg) // TODO (LLVM 21): This can be dropped once we only support Clang >= 19.
   return __builtin_popcountg(__t);
-#else
-  return std::__popcount_impl(__t);
-#endif
 }
 
 #if _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__stop_token/atomic_unique_lock.h b/libcxx/include/__stop_token/atomic_unique_lock.h
index a698260ac7bb..05e8f223167f 100644
--- a/libcxx/include/__stop_token/atomic_unique_lock.h
+++ b/libcxx/include/__stop_token/atomic_unique_lock.h
@@ -28,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // and LockedBit is the value of State when the lock bit is set, e.g  1 << 2
 template <class _State, _State _LockedBit>
 class _LIBCPP_AVAILABILITY_SYNC __atomic_unique_lock {
-  static_assert(std::__libcpp_popcount(static_cast<unsigned long long>(_LockedBit)) == 1,
+  static_assert(std::__popcount(static_cast<unsigned long long>(_LockedBit)) == 1,
                 "LockedBit must be an integer where only one bit is set");
 
   std::atomic<_State>& __state_;
diff --git a/libunwind/docs/BuildingLibunwind.rst b/libunwind/docs/BuildingLibunwind.rst
index 8b4f1207d4ba..c231587fd502 100644
--- a/libunwind/docs/BuildingLibunwind.rst
+++ b/libunwind/docs/BuildingLibunwind.rst
@@ -91,7 +91,7 @@ libunwind specific options
 
 .. option:: LIBUNWIND_ENABLE_WERROR:BOOL
 
-  **Default**: ``ON``
+  **Default**: ``OFF``
 
   Compile with -Werror
 
diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h
index 8b6317c6f33c..4f8405f1f0db 100644
--- a/lldb/include/lldb/Symbol/SymbolContext.h
+++ b/lldb/include/lldb/Symbol/SymbolContext.h
@@ -307,6 +307,13 @@ public:
                                SymbolContext &next_frame_sc,
                                Address &inlined_frame_addr) const;
 
+  /// If available, will return the function name according to the specified
+  /// mangling preference. If this object represents an inlined function,
+  /// returns the name of the inlined function. Returns nullptr if no function
+  /// name could be determined.
+  const char *GetPossiblyInlinedFunctionName(
+      Mangled::NamePreference mangling_preference) const;
+
   // Member variables
   lldb::TargetSP target_sp; ///< The Target for a given query
   lldb::ModuleSP module_sp; ///< The Module for a given query
diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index b699a90aff8e..da2c2cc451da 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -268,7 +268,7 @@ public:
   // the reference has never been assigned
   virtual bool IsUninitializedReference(ValueObject &valobj);
 
-  virtual bool GetFunctionDisplayName(const SymbolContext *sc,
+  virtual bool GetFunctionDisplayName(const SymbolContext &sc,
                                       const ExecutionContext *exe_ctx,
                                       FunctionNameRepresentation representation,
                                       Stream &s);
diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp
index a9370595c11e..23e5999bd80c 100644
--- a/lldb/source/Core/FormatEntity.cpp
+++ b/lldb/source/Core/FormatEntity.cpp
@@ -1147,19 +1147,6 @@ static void PrettyPrintFunctionNameWithArgs(Stream &out_stream,
     out_stream.PutChar(')');
 }
 
-static void FormatInlinedBlock(Stream &out_stream, Block *block) {
-  if (!block)
-    return;
-  Block *inline_block = block->GetContainingInlinedBlock();
-  if (inline_block) {
-    if (const InlineFunctionInfo *inline_info =
-            inline_block->GetInlinedFunctionInfo()) {
-      out_stream.PutCString(" [inlined] ");
-      inline_info->GetName().Dump(&out_stream);
-    }
-  }
-}
-
 static VariableListSP GetFunctionVariableList(const SymbolContext &sc) {
   assert(sc.function);
 
@@ -1170,22 +1157,6 @@ static VariableListSP GetFunctionVariableList(const SymbolContext &sc) {
   return sc.function->GetBlock(true).GetBlockVariableList(true);
 }
 
-static char const *GetInlinedFunctionName(const SymbolContext &sc) {
-  if (!sc.block)
-    return nullptr;
-
-  const Block *inline_block = sc.block->GetContainingInlinedBlock();
-  if (!inline_block)
-    return nullptr;
-
-  const InlineFunctionInfo *inline_info =
-      inline_block->GetInlinedFunctionInfo();
-  if (!inline_info)
-    return nullptr;
-
-  return inline_info->GetName().AsCString(nullptr);
-}
-
 static bool PrintFunctionNameWithArgs(Stream &s,
                                       const ExecutionContext *exe_ctx,
                                       const SymbolContext &sc) {
@@ -1194,16 +1165,11 @@ static bool PrintFunctionNameWithArgs(Stream &s,
   ExecutionContextScope *exe_scope =
       exe_ctx ? exe_ctx->GetBestExecutionContextScope() : nullptr;
 
-  const char *cstr = sc.function->GetName().AsCString(nullptr);
+  const char *cstr =
+      sc.GetPossiblyInlinedFunctionName(Mangled::ePreferDemangled);
   if (!cstr)
     return false;
 
-  if (const char *inlined_name = GetInlinedFunctionName(sc)) {
-    s.PutCString(cstr);
-    s.PutCString(" [inlined] ");
-    cstr = inlined_name;
-  }
-
   VariableList args;
   if (auto variable_list_sp = GetFunctionVariableList(sc))
     variable_list_sp->AppendVariablesWithScope(eValueTypeVariableArgument,
@@ -1218,6 +1184,40 @@ static bool PrintFunctionNameWithArgs(Stream &s,
   return true;
 }
 
+static bool HandleFunctionNameWithArgs(Stream &s,const ExecutionContext *exe_ctx,
+                                       const SymbolContext &sc) {
+  Language *language_plugin = nullptr;
+  bool language_plugin_handled = false;
+  StreamString ss;
+  if (sc.function)
+    language_plugin = Language::FindPlugin(sc.function->GetLanguage());
+  else if (sc.symbol)
+    language_plugin = Language::FindPlugin(sc.symbol->GetLanguage());
+
+  if (language_plugin)
+    language_plugin_handled = language_plugin->GetFunctionDisplayName(
+        sc, exe_ctx, Language::FunctionNameRepresentation::eNameWithArgs, ss);
+
+  if (language_plugin_handled) {
+    s << ss.GetString();
+    return true;
+  }
+
+  if (sc.function)
+    return PrintFunctionNameWithArgs(s, exe_ctx, sc);
+
+  if (!sc.symbol)
+    return false;
+
+  const char *cstr = sc.symbol->GetName().AsCString(nullptr);
+  if (!cstr)
+    return false;
+
+  s.PutCString(cstr);
+
+  return true;
+}
+
 bool FormatEntity::FormatStringRef(const llvm::StringRef &format_str, Stream &s,
                                    const SymbolContext *sc,
                                    const ExecutionContext *exe_ctx,
@@ -1719,63 +1719,24 @@ bool FormatEntity::Format(const Entry &entry, Stream &s,
 
     if (language_plugin)
       language_plugin_handled = language_plugin->GetFunctionDisplayName(
-          sc, exe_ctx, Language::FunctionNameRepresentation::eName, ss);
+          *sc, exe_ctx, Language::FunctionNameRepresentation::eName, ss);
 
     if (language_plugin_handled) {
       s << ss.GetString();
       return true;
-    } else {
-      const char *name = nullptr;
-      if (sc->function)
-        name = sc->function->GetName().AsCString(nullptr);
-      else if (sc->symbol)
-        name = sc->symbol->GetName().AsCString(nullptr);
-
-      if (name) {
-        s.PutCString(name);
-        FormatInlinedBlock(s, sc->block);
-        return true;
-      }
     }
-  }
-    return false;
 
-  case Entry::Type::FunctionNameNoArgs: {
-    if (!sc)
+    const char *name = sc->GetPossiblyInlinedFunctionName(
+        Mangled::NamePreference::ePreferDemangled);
+    if (!name)
       return false;
 
-    Language *language_plugin = nullptr;
-    bool language_plugin_handled = false;
-    StreamString ss;
-    if (sc->function)
-      language_plugin = Language::FindPlugin(sc->function->GetLanguage());
-    else if (sc->symbol)
-      language_plugin = Language::FindPlugin(sc->symbol->GetLanguage());
-
-    if (language_plugin)
-      language_plugin_handled = language_plugin->GetFunctionDisplayName(
-          sc, exe_ctx, Language::FunctionNameRepresentation::eNameWithNoArgs,
-          ss);
+    s.PutCString(name);
 
-    if (language_plugin_handled) {
-      s << ss.GetString();
-      return true;
-    } else {
-      ConstString name;
-      if (sc->function)
-        name = sc->function->GetNameNoArguments();
-      else if (sc->symbol)
-        name = sc->symbol->GetNameNoArguments();
-      if (name) {
-        s.PutCString(name.GetCString());
-        FormatInlinedBlock(s, sc->block);
-        return true;
-      }
-    }
+    return true;
   }
-    return false;
 
-  case Entry::Type::FunctionNameWithArgs: {
+  case Entry::Type::FunctionNameNoArgs: {
     if (!sc)
       return false;
 
@@ -1789,44 +1750,42 @@ bool FormatEntity::Format(const Entry &entry, Stream &s,
 
     if (language_plugin)
       language_plugin_handled = language_plugin->GetFunctionDisplayName(
-          sc, exe_ctx, Language::FunctionNameRepresentation::eNameWithArgs, ss);
+          *sc, exe_ctx, Language::FunctionNameRepresentation::eNameWithNoArgs,
+          ss);
 
     if (language_plugin_handled) {
       s << ss.GetString();
       return true;
     }
 
-    if (sc->function)
-      return PrintFunctionNameWithArgs(s, exe_ctx, *sc);
-
-    if (!sc->symbol)
+    const char *name = sc->GetPossiblyInlinedFunctionName(
+        Mangled::NamePreference::ePreferDemangledWithoutArguments);
+    if (!name)
       return false;
 
-    const char *cstr = sc->symbol->GetName().AsCString(nullptr);
-    if (!cstr)
-      return false;
+    s.PutCString(name);
 
-    s.PutCString(cstr);
     return true;
   }
 
-  case Entry::Type::FunctionMangledName: {
+  case Entry::Type::FunctionNameWithArgs: {
     if (!sc)
       return false;
 
-    const char *name = nullptr;
-    if (sc->symbol)
-      name =
-          sc->symbol->GetMangled().GetName(Mangled::ePreferMangled).AsCString();
-    else if (sc->function)
-      name = sc->function->GetMangled()
-                 .GetName(Mangled::ePreferMangled)
-                 .AsCString();
+    return HandleFunctionNameWithArgs(s, exe_ctx, *sc);
+  }
 
+  case Entry::Type::FunctionMangledName: {
+    if (!sc)
+      return false;
+
+    const char *name = sc->GetPossiblyInlinedFunctionName(
+        Mangled::NamePreference::ePreferMangled);
     if (!name)
       return false;
+
     s.PutCString(name);
-    FormatInlinedBlock(s, sc->block);
+
     return true;
   }
   case Entry::Type::FunctionAddrOffset:
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index a6fdf66f13e4..9bd48ec55022 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -1707,22 +1707,6 @@ static VariableListSP GetFunctionVariableList(const SymbolContext &sc) {
   return sc.function->GetBlock(true).GetBlockVariableList(true);
 }
 
-static char const *GetInlinedFunctionName(const SymbolContext &sc) {
-  if (!sc.block)
-    return nullptr;
-
-  const Block *inline_block = sc.block->GetContainingInlinedBlock();
-  if (!inline_block)
-    return nullptr;
-
-  const InlineFunctionInfo *inline_info =
-      inline_block->GetInlinedFunctionInfo();
-  if (!inline_info)
-    return nullptr;
-
-  return inline_info->GetName().AsCString(nullptr);
-}
-
 static bool PrintFunctionNameWithArgs(Stream &s,
                                       const ExecutionContext *exe_ctx,
                                       const SymbolContext &sc) {
@@ -1731,16 +1715,11 @@ static bool PrintFunctionNameWithArgs(Stream &s,
   ExecutionContextScope *exe_scope =
       exe_ctx ? exe_ctx->GetBestExecutionContextScope() : nullptr;
 
-  const char *cstr = sc.function->GetName().AsCString(nullptr);
+  const char *cstr = sc.GetPossiblyInlinedFunctionName(
+      Mangled::NamePreference::ePreferDemangled);
   if (!cstr)
     return false;
 
-  if (const char *inlined_name = GetInlinedFunctionName(sc)) {
-    s.PutCString(cstr);
-    s.PutCString(" [inlined] ");
-    cstr = inlined_name;
-  }
-
   VariableList args;
   if (auto variable_list_sp = GetFunctionVariableList(sc))
     variable_list_sp->AppendVariablesWithScope(eValueTypeVariableArgument,
@@ -1757,20 +1736,18 @@ static bool PrintFunctionNameWithArgs(Stream &s,
 }
 
 bool CPlusPlusLanguage::GetFunctionDisplayName(
-    const SymbolContext *sc, const ExecutionContext *exe_ctx,
+    const SymbolContext &sc, const ExecutionContext *exe_ctx,
     FunctionNameRepresentation representation, Stream &s) {
   switch (representation) {
   case FunctionNameRepresentation::eNameWithArgs: {
-    assert(sc);
-
     // Print the function name with arguments in it
-    if (sc->function)
-      return PrintFunctionNameWithArgs(s, exe_ctx, *sc);
+    if (sc.function)
+      return PrintFunctionNameWithArgs(s, exe_ctx, sc);
 
-    if (!sc->symbol)
+    if (!sc.symbol)
       return false;
 
-    const char *cstr = sc->symbol->GetName().AsCString(nullptr);
+    const char *cstr = sc.symbol->GetName().AsCString(nullptr);
     if (!cstr)
       return false;
 
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h
index 623d481bf117..54f5a94388b9 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h
@@ -138,7 +138,7 @@ public:
   ConstString
   GetDemangledFunctionNameWithoutArguments(Mangled mangled) const override;
 
-  bool GetFunctionDisplayName(const SymbolContext *sc,
+  bool GetFunctionDisplayName(const SymbolContext &sc,
                               const ExecutionContext *exe_ctx,
                               FunctionNameRepresentation representation,
                               Stream &s) override;
diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp
index 183947a69436..a9626bbc3777 100644
--- a/lldb/source/Symbol/SymbolContext.cpp
+++ b/lldb/source/Symbol/SymbolContext.cpp
@@ -872,6 +872,36 @@ const Symbol *SymbolContext::FindBestGlobalDataSymbol(ConstString name,
   return nullptr; // no error; we just didn't find anything
 }
 
+char const *SymbolContext::GetPossiblyInlinedFunctionName(
+    Mangled::NamePreference mangling_preference) const {
+  const char *name = nullptr;
+  if (function)
+    name = function->GetMangled().GetName(mangling_preference).AsCString();
+  else if (symbol)
+    name = symbol->GetMangled().GetName(mangling_preference).AsCString();
+
+  if (!block)
+    return name;
+
+  const Block *inline_block = block->GetContainingInlinedBlock();
+  if (!inline_block)
+    return name;
+
+  const InlineFunctionInfo *inline_info =
+      inline_block->GetInlinedFunctionInfo();
+  if (!inline_info)
+    return name;
+
+  // If we do have an inlined frame name, return that.
+  if (char const *inline_name =
+          inline_info->GetMangled().GetName(mangling_preference).AsCString())
+    return inline_name;
+
+  // Sometimes an inline frame may not have mangling information,
+  // but does have a valid name.
+  return inline_info->GetName().AsCString();
+}
+
 //
 //  SymbolContextSpecifier
 //
diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp
index a75894ffa4b3..86754c251cd9 100644
--- a/lldb/source/Target/Language.cpp
+++ b/lldb/source/Target/Language.cpp
@@ -510,7 +510,7 @@ bool Language::IsNilReference(ValueObject &valobj) { return false; }
 
 bool Language::IsUninitializedReference(ValueObject &valobj) { return false; }
 
-bool Language::GetFunctionDisplayName(const SymbolContext *sc,
+bool Language::GetFunctionDisplayName(const SymbolContext &sc,
                                       const ExecutionContext *exe_ctx,
                                       FunctionNameRepresentation representation,
                                       Stream &s) {
diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
index 7ad72b4880d7..64e2a5b47967 100644
--- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
+++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
@@ -70,8 +70,8 @@ __attribute__((noinline)) void func6(int &sink, int x) {
 __attribute__((noinline)) void func7(int &sink, int x) {
   //% self.filecheck("bt", "main.cpp", "-check-prefix=FUNC7-BT")
   // FUNC7-BT: func7
-  // FUNC7-BT-NEXT: [inlined] func8_inlined
-  // FUNC7-BT-NEXT: [inlined] func9_inlined
+  // FUNC7-BT-NEXT: func8_inlined
+  // FUNC7-BT-NEXT: func9_inlined
   // FUNC7-BT-NEXT: func10
   use<int &, int>(sink, x);
   use<int &, int>(dummy, 0);
diff --git a/lldb/test/API/functionalities/tail_call_frames/inlining_and_tail_calls/main.cpp b/lldb/test/API/functionalities/tail_call_frames/inlining_and_tail_calls/main.cpp
index 9829e0246fc2..0a7d365d776c 100644
--- a/lldb/test/API/functionalities/tail_call_frames/inlining_and_tail_calls/main.cpp
+++ b/lldb/test/API/functionalities/tail_call_frames/inlining_and_tail_calls/main.cpp
@@ -1,13 +1,13 @@
 volatile int x;
 
+// clang-format off
 void __attribute__((noinline)) tail_call_sink() {
   x++; //% self.filecheck("bt", "main.cpp", "-check-prefix=TAIL-CALL-SINK")
   // TAIL-CALL-SINK: frame #0: 0x{{[0-9a-f]+}} a.out`tail_call_sink() at main.cpp:[[@LINE-1]]:4
-  // TAIL-CALL-SINK-NEXT: func3{{.*}} [artificial]
+  // TAIL-CALL-SINK-NEXT: inlinable_function_which_tail_calls() at main.cpp{{.*}} [artificial]
   // TAIL-CALL-SINK-NEXT: main{{.*}}
-
-  // TODO: The backtrace should include inlinable_function_which_tail_calls.
 }
+// clang-format on
 
 void __attribute__((always_inline)) inlinable_function_which_tail_calls() {
   tail_call_sink();
@@ -17,13 +17,15 @@ void __attribute__((noinline)) func3() {
   inlinable_function_which_tail_calls();
 }
 
+// clang-format off
 void __attribute__((always_inline)) inline_sink() {
   x++; //% self.filecheck("bt", "main.cpp", "-check-prefix=INLINE-SINK")
-  // INLINE-SINK: frame #0: 0x{{[0-9a-f]+}} a.out`func2() [inlined] inline_sink() at main.cpp:[[@LINE-1]]:4
+  // INLINE-SINK: frame #0: 0x{{[0-9a-f]+}} a.out`inline_sink() at main.cpp:[[@LINE-1]]:4
   // INLINE-SINK-NEXT: func2{{.*}}
   // INLINE-SINK-NEXT: func1{{.*}} [artificial]
   // INLINE-SINK-NEXT: main{{.*}}
 }
+// clang-format on
 
 void __attribute__((noinline)) func2() { inline_sink(); /* inlined */ }
 
diff --git a/lldb/test/API/python_api/target/read-instructions-flavor/TestTargetReadInstructionsFlavor.py b/lldb/test/API/python_api/target/read-instructions-flavor/TestTargetReadInstructionsFlavor.py
index 12805985798d..f488d4f421c9 100644
--- a/lldb/test/API/python_api/target/read-instructions-flavor/TestTargetReadInstructionsFlavor.py
+++ b/lldb/test/API/python_api/target/read-instructions-flavor/TestTargetReadInstructionsFlavor.py
@@ -7,6 +7,7 @@ from lldbsuite.test.lldbtest import *
 
 
 class TargetReadInstructionsFlavor(TestBase):
+    @skipIfDarwin
     @skipIfWindows
     @skipIf(archs=no_match(["x86_64", "x86", "i386"]))
     def test_read_instructions_with_flavor(self):
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
index 0c3275c571b3..2ea6594643c9 100644
--- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
+++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test
@@ -12,5 +12,5 @@ run
 frame recognizer info 0
 # CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer
 frame info
-# CHECK: frame #0: {{.*}}`std::recursively_aborts(int) {{.*}} at verbose_trap-in-stl-max-depth.cpp
+# CHECK: frame #0: {{.*}}`__clang_trap_msg$Error$max depth at verbose_trap-in-stl-max-depth.cpp
 q
diff --git a/lldb/test/Shell/Settings/TestFrameFormatName.test b/lldb/test/Shell/Settings/TestFrameFormatName.test
index caa3242527c6..110daceb47b4 100644
--- a/lldb/test/Shell/Settings/TestFrameFormatName.test
+++ b/lldb/test/Shell/Settings/TestFrameFormatName.test
@@ -30,7 +30,7 @@ c
 c
 # NAME_WITH_ARGS: frame Foo::returns_func_ptr<int>(this={{.*}}, (null)={{.*}})
 c
-# NAME_WITH_ARGS: frame main [inlined] inlined_foo(str="bar")
+# NAME_WITH_ARGS: frame inlined_foo(str="bar")
 q
 
 #--- name.input
@@ -38,18 +38,18 @@ q
 settings set -f frame-format "frame ${function.name}\n"
 break set -n inlined_foo
 run
-# NAME: frame main [inlined] inlined_foo(char const*)
+# NAME: frame inlined_foo(char const*)
 
 #--- name_without_args.input
 # RUN: %lldb -b -s %t/name_without_args.input %t.out | FileCheck %s --check-prefix=NAME_WITHOUT_ARGS
 settings set -f frame-format "frame ${function.name-without-args}\n"
 break set -n inlined_foo
 run
-# NAME_WITHOUT_ARGS: frame main [inlined] inlined_foo(char const*)
+# NAME_WITHOUT_ARGS: frame inlined_foo
 
 #--- mangled_name.input
 # RUN: %lldb -b -s %t/mangled_name.input %t.out | FileCheck %s --check-prefix=MANGLED_NAME
 settings set -f frame-format "frame ${function.mangled-name}\n"
 break set -n inlined_foo
 run
-# MANGLED_NAME: frame main [inlined] inlined_foo(char const*)
+# MANGLED_NAME: frame _Z11inlined_fooPKc
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp
index 906f3d7dff0a..4a06e6350b00 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp
+++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp
@@ -23,11 +23,11 @@ int main(int argc, char** argv) {
 }
 
 // CHECK:      * thread #1, {{.*}}stop reason = breakpoint 1
-// CHECK-NEXT:    frame #0: {{.*}}`main [inlined] bar(param=2)
+// CHECK-NEXT:    frame #0: {{.*}}`bar(param=2)
 // CHECK:      (lldb) expression param
 // CHECK-NEXT: (int) $0 = 2
 // CHECK:      * thread #1, {{.*}}stop reason = breakpoint 2
-// CHECK-NEXT:    frame #0: {{.*}}`main [inlined] foo(param=1)
+// CHECK-NEXT:    frame #0: {{.*}}`foo(param=1)
 // CHECK:      (lldb) expression param
 // CHECK-NEXT: (int) $1 = 1
 // CHECK-NEXT: (lldb) expression local
diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index ffcc718b4777..bf914c379e80 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -68,7 +68,9 @@ function(tablegen project ofn)
   # char literals, instead. If we're cross-compiling, then conservatively assume
   # that the source might be consumed by MSVC.
   # [1] https://docs.microsoft.com/en-us/cpp/cpp/compiler-limits?view=vs-2017
-  if (MSVC AND project STREQUAL LLVM)
+  # Don't pass this flag to mlir-src-sharder, since it doesn't support the
+  # flag, and it doesn't need it.
+  if (MSVC AND NOT "${project}" STREQUAL "MLIR_SRC_SHARDER")
     list(APPEND LLVM_TABLEGEN_FLAGS "--long-string-literals=0")
   endif()
   if (CMAKE_GENERATOR MATCHES "Visual Studio")
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 25fc8a00f7dd..01e51accec9d 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -354,6 +354,7 @@ struct APFloatBase {
   static bool semanticsHasInf(const fltSemantics &);
   static bool semanticsHasNaN(const fltSemantics &);
   static bool isIEEELikeFP(const fltSemantics &);
+  static bool hasSignBitInMSB(const fltSemantics &);
 
   // Returns true if any number described by \p Src can be precisely represented
   // by a normal (not subnormal) value in \p Dst.
diff --git a/llvm/include/llvm/CodeGen/BranchRelaxation.h b/llvm/include/llvm/CodeGen/BranchRelaxation.h
new file mode 100644
index 000000000000..2007cf05b3aa
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/BranchRelaxation.h
@@ -0,0 +1,25 @@
+//===- llvm/CodeGen/BranchRelaxation.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_BRANCHRELAXATION_H
+#define LLVM_CODEGEN_BRANCHRELAXATION_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class BranchRelaxationPass : public PassInfoMixin<BranchRelaxationPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_BRANCHRELAXATION_H
diff --git a/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h b/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h
new file mode 100644
index 000000000000..bbd5b8b430bf
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/RemoveLoadsIntoFakeUses.h
@@ -0,0 +1,30 @@
+//===- llvm/CodeGen/RemoveLoadsIntoFakeUses.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H
+#define LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class RemoveLoadsIntoFakeUsesPass
+    : public PassInfoMixin<RemoveLoadsIntoFakeUsesPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+
+  MachineFunctionProperties getRequiredProperties() const {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_REMOVELOADSINTOFAKEUSES_H
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 6180c53a9a94..ab3eaa92548c 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -467,10 +467,7 @@ public:
 
   /// Returns true if Reg contains RegUnit.
   bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const {
-    for (MCRegUnit Unit : regunits(Reg))
-      if (Unit == RegUnit)
-        return true;
-    return false;
+    return llvm::is_contained(regunits(Reg), RegUnit);
   }
 
   /// Returns the original SrcReg unless it is the target of a copy-like
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
index fdc97249d8e5..aa47bd9cd2cd 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
@@ -102,11 +102,7 @@ class LVDWARFReader final : public LVBinaryReader {
   }
 
   // Remove offset from global map.
-  void removeGlobalOffset(LVOffset Offset) {
-    LVOffsetElementMap::iterator Iter = GlobalOffsets.find(Offset);
-    if (Iter != GlobalOffsets.end())
-      GlobalOffsets.erase(Iter);
-  }
+  void removeGlobalOffset(LVOffset Offset) { GlobalOffsets.erase(Offset); }
 
   // Get the location information for DW_AT_data_member_location.
   void processLocationMember(dwarf::Attribute Attr,
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 9fd5e7676b19..3242ccff7f87 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -61,7 +61,7 @@ void initializeBasicAAWrapperPassPass(PassRegistry &);
 void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &);
 void initializeBranchFolderLegacyPass(PassRegistry &);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &);
-void initializeBranchRelaxationPass(PassRegistry &);
+void initializeBranchRelaxationLegacyPass(PassRegistry &);
 void initializeBreakCriticalEdgesPass(PassRegistry &);
 void initializeBreakFalseDepsPass(PassRegistry &);
 void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &);
@@ -267,7 +267,7 @@ void initializeRegionOnlyViewerPass(PassRegistry &);
 void initializeRegionPrinterPass(PassRegistry &);
 void initializeRegionViewerPass(PassRegistry &);
 void initializeRegisterCoalescerLegacyPass(PassRegistry &);
-void initializeRemoveLoadsIntoFakeUsesPass(PassRegistry &);
+void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &);
 void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &);
 void initializeRenameIndependentSubregsLegacyPass(PassRegistry &);
 void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 5953de30c2eb..10eabd41e80f 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -109,11 +109,10 @@ public:
     return false;
   }
 
-  virtual bool evaluateTargetFixup(const MCAssembler &Asm,
-                                   const MCFixup &Fixup, const MCFragment *DF,
-                                   const MCValue &Target,
-                                   const MCSubtargetInfo *STI, uint64_t &Value,
-                                   bool &WasForced) {
+  virtual bool evaluateTargetFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                                   const MCFragment *DF, const MCValue &Target,
+                                   const MCSubtargetInfo *STI,
+                                   uint64_t &Value) {
     llvm_unreachable("Need to implement hook if target has custom fixups");
   }
 
@@ -153,11 +152,9 @@ public:
 
   /// Target specific predicate for whether a given fixup requires the
   /// associated instruction to be relaxed.
-  virtual bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
-                                            const MCFixup &Fixup, bool Resolved,
-                                            uint64_t Value,
-                                            const MCRelaxableFragment *DF,
-                                            const bool WasForced) const;
+  virtual bool fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                            const MCFixup &, const MCValue &,
+                                            uint64_t, bool Resolved) const;
 
   /// Simple predicate for targets where !Resolved implies requiring relaxation
   virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
@@ -225,11 +222,6 @@ public:
     return 0;
   }
 
-  /// Check whether a given symbol has been flagged with MICROMIPS flag.
-  virtual bool isMicroMips(const MCSymbol *Sym) const {
-    return false;
-  }
-
   bool isDarwinCanonicalPersonality(const MCSymbol *Sym) const;
 };
 
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index a68eb49fda28..57143e3d59b4 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -95,14 +95,11 @@ private:
   /// evaluates to.
   /// \param Value [out] On return, the value of the fixup as currently laid
   /// out.
-  /// \param WasForced [out] On return, the value in the fixup is set to the
-  /// correct value if WasForced is true, even if evaluateFixup returns false.
-  /// \return Whether the fixup value was fully resolved. This is true if the
-  /// \p Value result is fixed, otherwise the value may change due to
+  /// \param RecordReloc Record relocation if needed.
   /// relocation.
   bool evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
                      MCValue &Target, const MCSubtargetInfo *STI,
-                     uint64_t &Value, bool &WasForced) const;
+                     uint64_t &Value, bool RecordReloc) const;
 
   /// Check whether a fixup can be satisfied, or whether it needs to be relaxed
   /// (increased in size, in order to hold its value correctly).
@@ -127,9 +124,6 @@ private:
   bool relaxCVDefRange(MCCVDefRangeFragment &DF);
   bool relaxPseudoProbeAddr(MCPseudoProbeAddrFragment &DF);
 
-  std::tuple<MCValue, uint64_t, bool>
-  handleFixup(MCFragment &F, const MCFixup &Fixup, const MCSubtargetInfo *STI);
-
 public:
   /// Construct a new assembler instance.
   //
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 25ca982916ff..6e2c2683730c 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -74,6 +74,7 @@
 #include "llvm/CodeGen/RegUsageInfoPropagate.h"
 #include "llvm/CodeGen/RegisterCoalescerPass.h"
 #include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/CodeGen/RemoveLoadsIntoFakeUses.h"
 #include "llvm/CodeGen/RemoveRedundantDebugValues.h"
 #include "llvm/CodeGen/RenameIndependentSubregs.h"
 #include "llvm/CodeGen/ReplaceWithVeclib.h"
@@ -1003,6 +1004,7 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::addMachinePasses(
 
   addPass(FuncletLayoutPass());
 
+  addPass(RemoveLoadsIntoFakeUsesPass());
   addPass(StackMapLivenessPass());
   addPass(LiveDebugValuesPass(
       getTM<TargetMachine>().Options.ShouldEmitDebugEntryValues()));
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 232d5506f5b3..94febae16eee 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -137,6 +137,7 @@ MACHINE_FUNCTION_ANALYSIS("virtregmap", VirtRegMapAnalysis())
 #ifndef MACHINE_FUNCTION_PASS
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+MACHINE_FUNCTION_PASS("branch-relaxation", BranchRelaxationPass())
 MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
 MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass())
 MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass())
@@ -181,6 +182,7 @@ MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass())
 MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass())
 MACHINE_FUNCTION_PASS("register-coalescer", RegisterCoalescerPass())
 MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass())
+MACHINE_FUNCTION_PASS("remove-loads-into-fake-uses", RemoveLoadsIntoFakeUsesPass())
 MACHINE_FUNCTION_PASS("remove-redundant-debug-values", RemoveRedundantDebugValuesPass())
 MACHINE_FUNCTION_PASS("require-all-machine-function-properties",
                       RequireAllMachineFunctionPropertiesPass())
@@ -310,7 +312,6 @@ DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass)
 DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass)
 DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass)
 DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass)
-DUMMY_MACHINE_FUNCTION_PASS("remove-loads-into-fake-uses", RemoveLoadsIntoFakeUsesPass)
 DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass)
 DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass)
 DUMMY_MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass)
diff --git a/llvm/include/llvm/TableGen/Main.h b/llvm/include/llvm/TableGen/Main.h
index e8c60e286990..5f68be188de7 100644
--- a/llvm/include/llvm/TableGen/Main.h
+++ b/llvm/include/llvm/TableGen/Main.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TABLEGEN_MAIN_H
 #define LLVM_TABLEGEN_MAIN_H
 
+#include "llvm/Support/CommandLine.h"
 #include <functional>
 
 namespace llvm {
@@ -27,6 +28,10 @@ using TableGenMainFn = bool(raw_ostream &OS, const RecordKeeper &Records);
 int TableGenMain(const char *argv0,
                  std::function<TableGenMainFn> MainFn = nullptr);
 
+/// Controls emitting large character arrays as strings or character arrays.
+/// Typically set to false when building with MSVC.
+extern cl::opt<bool> EmitLongStrLiterals;
+
 } // end namespace llvm
 
 #endif // LLVM_TABLEGEN_MAIN_H
diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h
index e716411514bd..21795644d4bd 100644
--- a/llvm/include/llvm/TableGen/StringToOffsetTable.h
+++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h
@@ -12,8 +12,6 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
 #include <optional>
 
 namespace llvm {
@@ -36,17 +34,7 @@ public:
   bool empty() const { return StringOffset.empty(); }
   size_t size() const { return AggregateString.size(); }
 
-  unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) {
-    auto [II, Inserted] = StringOffset.insert({Str, size()});
-    if (Inserted) {
-      // Add the string to the aggregate if this is the first time found.
-      AggregateString.append(Str.begin(), Str.end());
-      if (appendZero)
-        AggregateString += '\0';
-    }
-
-    return II->second;
-  }
+  unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true);
 
   // Returns the offset of `Str` in the table if its preset, else return
   // std::nullopt.
@@ -69,96 +57,10 @@ public:
   // `static` and `constexpr`. Both `Name` and (`Name` + "Storage") must be
   // valid identifiers to declare.
   void EmitStringTableDef(raw_ostream &OS, const Twine &Name,
-                          const Twine &Indent = "") const {
-    OS << formatv(R"(
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Woverlength-strings"
-#endif
-{0}static constexpr char {1}Storage[] = )",
-                  Indent, Name);
-
-    // MSVC silently miscompiles string literals longer than 64k in some
-    // circumstances. When the string table is longer, emit it as an array of
-    // character literals.
-    bool UseChars = AggregateString.size() > (64 * 1024);
-    OS << (UseChars ? "{\n" : "\n");
-
-    llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n");
-    llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0'));
-    // We should always have an empty string at the start, and because these are
-    // null terminators rather than separators, we'll have one at the end as
-    // well. Skip the end one.
-    assert(Strings.front().empty() && "Expected empty initial string!");
-    assert(Strings.back().empty() &&
-           "Expected empty string at the end due to terminators!");
-    Strings.pop_back();
-    for (StringRef Str : Strings) {
-      OS << LineSep << Indent << "  ";
-      // If we can, just emit this as a string literal to be concatenated.
-      if (!UseChars) {
-        OS << "\"";
-        OS.write_escaped(Str);
-        OS << "\\0\"";
-        continue;
-      }
-
-      llvm::ListSeparator CharSep(", ");
-      for (char C : Str) {
-        OS << CharSep << "'";
-        OS.write_escaped(StringRef(&C, 1));
-        OS << "'";
-      }
-      OS << CharSep << "'\\0'";
-    }
-    OS << LineSep << Indent << (UseChars ? "};" : "  ;");
-
-    OS << formatv(R"(
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-{0}static constexpr llvm::StringTable {1} =
-{0}    {1}Storage;
-)",
-                  Indent, Name);
-  }
+                          const Twine &Indent = "") const;
 
   // Emit the string as one single string.
-  void EmitString(raw_ostream &O) const {
-    // Escape the string.
-    SmallString<256> EscapedStr;
-    raw_svector_ostream(EscapedStr).write_escaped(AggregateString);
-
-    O << "    \"";
-    unsigned CharsPrinted = 0;
-    for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) {
-      if (CharsPrinted > 70) {
-        O << "\"\n    \"";
-        CharsPrinted = 0;
-      }
-      O << EscapedStr[i];
-      ++CharsPrinted;
-
-      // Print escape sequences all together.
-      if (EscapedStr[i] != '\\')
-        continue;
-
-      assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!");
-      if (isDigit(EscapedStr[i + 1])) {
-        assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) &&
-               "Expected 3 digit octal escape!");
-        O << EscapedStr[++i];
-        O << EscapedStr[++i];
-        O << EscapedStr[++i];
-        CharsPrinted += 3;
-      } else {
-        O << EscapedStr[++i];
-        ++CharsPrinted;
-      }
-    }
-    O << "\"";
-  }
+  void EmitString(raw_ostream &O) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index e42113db4278..276708c2ebf7 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -224,6 +224,9 @@ Value *PHITransAddr::translateSubExpr(Value *V, BasicBlock *CurBB,
 
     // Scan to see if we have this GEP available.
     Value *APHIOp = GEPOps[0];
+    if (isa<ConstantData>(APHIOp))
+      return nullptr;
+
     for (User *U : APHIOp->users()) {
       if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U))
         if (GEPI->getType() == GEP->getType() &&
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c62ea1526981..d193c9e3210e 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -7841,7 +7841,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
               unsigned GCD = std::min(MulZeros, TZ);
               APInt DivAmt = APInt::getOneBitSet(BitWidth, TZ - GCD);
               SmallVector<const SCEV*, 4> MulOps;
-              MulOps.push_back(getConstant(OpC->getAPInt().lshr(GCD)));
+              MulOps.push_back(getConstant(OpC->getAPInt().ashr(GCD)));
               append_range(MulOps, LHSMul->operands().drop_front());
               auto *NewMul = getMulExpr(MulOps, LHSMul->getNoWrapFlags());
               ShiftedLHS = getUDivExpr(NewMul, getConstant(DivAmt));
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index 062283975851..55d1350e446a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -162,8 +162,7 @@ void WinException::endFunction(const MachineFunction *MF) {
 
   if (!MF->getEHContTargets().empty()) {
     // Copy the function's EH Continuation targets to a module-level list.
-    EHContTargets.insert(EHContTargets.end(), MF->getEHContTargets().begin(),
-                         MF->getEHContTargets().end());
+    llvm::append_range(EHContTargets, MF->getEHContTargets());
   }
 }
 
@@ -292,8 +291,7 @@ void WinException::endFuncletImpl() {
 
     if (!MF->getEHContTargets().empty()) {
       // Copy the function's EH Continuation targets to a module-level list.
-      EHContTargets.insert(EHContTargets.end(), MF->getEHContTargets().begin(),
-                           MF->getEHContTargets().end());
+      llvm::append_range(EHContTargets, MF->getEHContTargets());
     }
 
     // Switch back to the funclet start .text section now that we are done
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index a762aab43ddd..fbdc784c928c 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/BranchRelaxation.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -44,7 +45,7 @@ STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed");
 
 namespace {
 
-class BranchRelaxation : public MachineFunctionPass {
+class BranchRelaxation {
   /// BasicBlockInfo - Information about the offset and size of a single
   /// basic block.
   struct BasicBlockInfo {
@@ -116,22 +117,30 @@ class BranchRelaxation : public MachineFunctionPass {
   void verify();
 
 public:
+  bool run(MachineFunction &MF);
+};
+
+class BranchRelaxationLegacy : public MachineFunctionPass {
+public:
   static char ID;
 
-  BranchRelaxation() : MachineFunctionPass(ID) {}
+  BranchRelaxationLegacy() : MachineFunctionPass(ID) {}
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    return BranchRelaxation().run(MF);
+  }
 
   StringRef getPassName() const override { return BRANCH_RELAX_NAME; }
 };
 
 } // end anonymous namespace
 
-char BranchRelaxation::ID = 0;
+char BranchRelaxationLegacy::ID = 0;
 
-char &llvm::BranchRelaxationPassID = BranchRelaxation::ID;
+char &llvm::BranchRelaxationPassID = BranchRelaxationLegacy::ID;
 
-INITIALIZE_PASS(BranchRelaxation, DEBUG_TYPE, BRANCH_RELAX_NAME, false, false)
+INITIALIZE_PASS(BranchRelaxationLegacy, DEBUG_TYPE, BRANCH_RELAX_NAME, false,
+                false)
 
 /// verify - check BBOffsets, BBSizes, alignment of islands
 void BranchRelaxation::verify() {
@@ -744,7 +753,16 @@ bool BranchRelaxation::relaxBranchInstructions() {
   return Changed;
 }
 
-bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+PreservedAnalyses
+BranchRelaxationPass::run(MachineFunction &MF,
+                          MachineFunctionAnalysisManager &MFAM) {
+  if (!BranchRelaxation().run(MF))
+    return PreservedAnalyses::all();
+
+  return getMachineFunctionPassPreservedAnalyses();
+}
+
+bool BranchRelaxation::run(MachineFunction &mf) {
   MF = &mf;
 
   LLVM_DEBUG(dbgs() << "***** BranchRelaxation *****\n");
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 8b777ed2bbc9..b77cefca00b7 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -23,7 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeBasicBlockPathCloningPass(Registry);
   initializeBasicBlockSectionsPass(Registry);
   initializeBranchFolderLegacyPass(Registry);
-  initializeBranchRelaxationPass(Registry);
+  initializeBranchRelaxationLegacyPass(Registry);
   initializeBreakFalseDepsPass(Registry);
   initializeCallBrPreparePass(Registry);
   initializeCFGuardLongjmpPass(Registry);
@@ -117,7 +117,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeRegUsageInfoCollectorLegacyPass(Registry);
   initializeRegUsageInfoPropagationLegacyPass(Registry);
   initializeRegisterCoalescerLegacyPass(Registry);
-  initializeRemoveLoadsIntoFakeUsesPass(Registry);
+  initializeRemoveLoadsIntoFakeUsesLegacyPass(Registry);
   initializeRemoveRedundantDebugValuesLegacyPass(Registry);
   initializeRenameIndependentSubregsLegacyPass(Registry);
   initializeSafeStackLegacyPassPass(Registry);
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 4cd378f9aa59..a83982de14b2 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2332,8 +2332,9 @@ void ComplexDeinterleavingGraph::replaceNodes() {
     } else if (RootNode->Operation ==
                ComplexDeinterleavingOperation::ReductionSingle) {
       auto *RootInst = cast<Instruction>(RootNode->Real);
-      ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
-      DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
+      auto &Info = ReductionInfo[RootInst];
+      Info.first->removeIncomingValue(BackEdge);
+      DeadInstrRoots.push_back(Info.second);
     } else {
       assert(R && "Unable to find replacement for RootInstruction");
       DeadInstrRoots.push_back(RootInstruction);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ac68eb55a6fd..ee271234d311 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1765,7 +1765,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
-        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
+        llvm::append_range(Xors, WidenedXors);
       }
 
       // Now, for each part we broke up, we know if they are equal/not equal
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 1ac1a770ae72..df3dd4196548 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1487,8 +1487,7 @@ void MachineLICMImpl::InitializeLoadsHoistableLoops() {
     auto *L = Worklist.pop_back_val();
     AllowedToHoistLoads[L] = true;
     LoopsInPreOrder.push_back(L);
-    Worklist.insert(Worklist.end(), L->getSubLoops().begin(),
-                    L->getSubLoops().end());
+    llvm::append_range(Worklist, L->getSubLoops());
   }
 
   // Going from the innermost to outermost loops, check if a loop has
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index dbd354f2ca2c..c27435aa2dae 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -3814,8 +3814,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
     // into an existing tracking collection, or insert a new one.
     RegIt = RegToPHIIdx.find(CP.getDstReg());
     if (RegIt != RegToPHIIdx.end())
-      RegIt->second.insert(RegIt->second.end(), InstrNums.begin(),
-                           InstrNums.end());
+      llvm::append_range(RegIt->second, InstrNums);
     else
       RegToPHIIdx.insert({CP.getDstReg(), InstrNums});
   }
diff --git a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp
index 384a049acfe3..042fc13090ef 100644
--- a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp
+++ b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp
@@ -22,11 +22,13 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/RemoveLoadsIntoFakeUses.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/Function.h"
@@ -41,12 +43,13 @@ using namespace llvm;
 STATISTIC(NumLoadsDeleted, "Number of dead load instructions deleted");
 STATISTIC(NumFakeUsesDeleted, "Number of FAKE_USE instructions deleted");
 
-class RemoveLoadsIntoFakeUses : public MachineFunctionPass {
+class RemoveLoadsIntoFakeUsesLegacy : public MachineFunctionPass {
 public:
   static char ID;
 
-  RemoveLoadsIntoFakeUses() : MachineFunctionPass(ID) {
-    initializeRemoveLoadsIntoFakeUsesPass(*PassRegistry::getPassRegistry());
+  RemoveLoadsIntoFakeUsesLegacy() : MachineFunctionPass(ID) {
+    initializeRemoveLoadsIntoFakeUsesLegacyPass(
+        *PassRegistry::getPassRegistry());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -66,21 +69,45 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
 };
 
-char RemoveLoadsIntoFakeUses::ID = 0;
-char &llvm::RemoveLoadsIntoFakeUsesID = RemoveLoadsIntoFakeUses::ID;
+struct RemoveLoadsIntoFakeUses {
+  bool run(MachineFunction &MF);
+};
+
+char RemoveLoadsIntoFakeUsesLegacy::ID = 0;
+char &llvm::RemoveLoadsIntoFakeUsesID = RemoveLoadsIntoFakeUsesLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(RemoveLoadsIntoFakeUses, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(RemoveLoadsIntoFakeUsesLegacy, DEBUG_TYPE,
                       "Remove Loads Into Fake Uses", false, false)
-INITIALIZE_PASS_END(RemoveLoadsIntoFakeUses, DEBUG_TYPE,
+INITIALIZE_PASS_END(RemoveLoadsIntoFakeUsesLegacy, DEBUG_TYPE,
                     "Remove Loads Into Fake Uses", false, false)
 
-bool RemoveLoadsIntoFakeUses::runOnMachineFunction(MachineFunction &MF) {
+bool RemoveLoadsIntoFakeUsesLegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  return RemoveLoadsIntoFakeUses().run(MF);
+}
+
+PreservedAnalyses
+RemoveLoadsIntoFakeUsesPass::run(MachineFunction &MF,
+                                 MachineFunctionAnalysisManager &MFAM) {
+  MFPropsModifier _(*this, MF);
+
+  if (!RemoveLoadsIntoFakeUses().run(MF))
+    return PreservedAnalyses::all();
+
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+bool RemoveLoadsIntoFakeUses::run(MachineFunction &MF) {
   // Skip this pass if we would use VarLoc-based LDV, as there may be DBG_VALUE
   // instructions of the restored values that would become invalid.
   if (!MF.useDebugInstrRef())
     return false;
   // Only run this for functions that have fake uses.
-  if (!MF.hasFakeUses() || skipFunction(MF.getFunction()))
+  if (!MF.hasFakeUses())
     return false;
 
   bool AnyChanges = false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 5210372dd935..83fade45d189 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -530,7 +530,7 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
       NewOps.push_back(Op);
     } else if (Op != OrigOp) {
       // This is the first operand to change - add all operands so far.
-      NewOps.insert(NewOps.end(), N->op_begin(), N->op_begin() + i);
+      llvm::append_range(NewOps, N->ops().take_front(i));
       NewOps.push_back(Op);
     }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 083b984444bc..598de6b20775 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2320,7 +2320,7 @@ void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops,
                               SelOps.size());
       Flags.setMemConstraint(ConstraintID);
       Handles.emplace_back(CurDAG->getTargetConstant(Flags, DL, MVT::i32));
-      Handles.insert(Handles.end(), SelOps.begin(), SelOps.end());
+      llvm::append_range(Handles, SelOps);
       i += 2;
     }
   }
diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 95c86a9ac266..2492dfc3ca55 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -679,8 +679,7 @@ MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
 }
 
 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
-  assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
-         "Cannot find OriMI in OriMIs!");
+  assert(llvm::is_contained(OriMIs, OriMI) && "Cannot find OriMI in OriMIs!");
   // If there is no instruction fold, all MI stages are 0.
   if (Offset == SchedPhiNum)
     return 0;
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
index 80b7452a0b22..15e583ca7685 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
@@ -475,8 +475,9 @@ Error DebugObjectManagerPlugin::notifyEmitted(
         FinalizePromise.set_value(MR.withResourceKeyDo([&](ResourceKey K) {
           assert(PendingObjs.count(&MR) && "We still hold PendingObjsLock");
           std::lock_guard<std::mutex> Lock(RegisteredObjsLock);
-          RegisteredObjs[K].push_back(std::move(PendingObjs[&MR]));
-          PendingObjs.erase(&MR);
+          auto It = PendingObjs.find(&MR);
+          RegisteredObjs[K].push_back(std::move(It->second));
+          PendingObjs.erase(It);
         }));
       });
 
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/ObjectFormats.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/ObjectFormats.cpp
index 3bd6c1e5be2c..d95f2f602fbc 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/ObjectFormats.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/ObjectFormats.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
+#include "llvm/ADT/STLExtras.h"
 
 namespace llvm {
 namespace orc {
@@ -34,10 +35,7 @@ StringRef ELFThreadBSSSectionName = ".tbss";
 StringRef ELFThreadDataSectionName = ".tdata";
 
 bool isMachOInitializerSection(StringRef QualifiedName) {
-  for (auto &InitSection : MachOInitSectionNames)
-    if (InitSection == QualifiedName)
-      return true;
-  return false;
+  return llvm::is_contained(MachOInitSectionNames, QualifiedName);
 }
 
 bool isELFInitializerSection(StringRef SecName) {
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index 23cc134f65b5..85a208ffbce0 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -115,11 +115,10 @@ bool MCAsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &,
   return Target.getSpecifier();
 }
 
-bool MCAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
+bool MCAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &,
                                                 const MCFixup &Fixup,
-                                                bool Resolved, uint64_t Value,
-                                                const MCRelaxableFragment *DF,
-                                                const bool WasForced) const {
+                                                const MCValue &, uint64_t Value,
+                                                bool Resolved) const {
   if (!Resolved)
     return true;
   return fixupNeedsRelaxation(Fixup, Value);
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 934bdb40d530..4e925809d20b 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -138,7 +138,7 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
 
 bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
                                 MCValue &Target, const MCSubtargetInfo *STI,
-                                uint64_t &Value, bool &WasForced) const {
+                                uint64_t &Value, bool RecordReloc) const {
   ++stats::evaluateFixup;
 
   // FIXME: This code has some duplication with recordRelocation. We should
@@ -150,47 +150,51 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
   const MCExpr *Expr = Fixup.getValue();
   MCContext &Ctx = getContext();
   Value = 0;
-  WasForced = false;
   if (!Expr->evaluateAsRelocatable(Target, this)) {
     Ctx.reportError(Fixup.getLoc(), "expected relocatable expression");
     return true;
   }
 
-  unsigned FixupFlags = getBackend().getFixupKindInfo(Fixup.getKind()).Flags;
-  if (FixupFlags & MCFixupKindInfo::FKF_IsTarget)
-    return getBackend().evaluateTargetFixup(*this, Fixup, DF, Target, STI,
-                                            Value, WasForced);
-
-  const MCSymbol *Add = Target.getAddSym();
-  const MCSymbol *Sub = Target.getSubSym();
-  bool IsPCRel = FixupFlags & MCFixupKindInfo::FKF_IsPCRel;
   bool IsResolved = false;
-  if (!IsPCRel) {
-    IsResolved = Target.isAbsolute();
-  } else if (Add && !Sub && !Add->isUndefined() && !Add->isAbsolute()) {
-    IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) ||
-                 getWriter().isSymbolRefDifferenceFullyResolvedImpl(
-                     *this, *Add, *DF, false, true);
+  unsigned FixupFlags = getBackend().getFixupKindInfo(Fixup.getKind()).Flags;
+  if (FixupFlags & MCFixupKindInfo::FKF_IsTarget) {
+    IsResolved =
+        getBackend().evaluateTargetFixup(*this, Fixup, DF, Target, STI, Value);
+  } else {
+    const MCSymbol *Add = Target.getAddSym();
+    const MCSymbol *Sub = Target.getSubSym();
+    Value = Target.getConstant();
+    if (Add && Add->isDefined())
+      Value += getSymbolOffset(*Add);
+    if (Sub && Sub->isDefined())
+      Value -= getSymbolOffset(*Sub);
+
+    bool IsPCRel = FixupFlags & MCFixupKindInfo::FKF_IsPCRel;
+    bool ShouldAlignPC =
+        FixupFlags & MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
+    if (IsPCRel) {
+      uint64_t Offset = getFragmentOffset(*DF) + Fixup.getOffset();
+
+      // A number of ARM fixups in Thumb mode require that the effective PC
+      // address be determined as the 32-bit aligned version of the actual
+      // offset.
+      if (ShouldAlignPC)
+        Offset &= ~0x3;
+      Value -= Offset;
+
+      if (Add && !Sub && !Add->isUndefined() && !Add->isAbsolute()) {
+        IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) ||
+                     getWriter().isSymbolRefDifferenceFullyResolvedImpl(
+                         *this, *Add, *DF, false, true);
+      }
+    } else {
+      IsResolved = Target.isAbsolute();
+      assert(!ShouldAlignPC && "FKF_IsAlignedDownTo32Bits must be PC-relative");
+    }
   }
 
-  Value = Target.getConstant();
-  if (Add && Add->isDefined())
-    Value += getSymbolOffset(*Add);
-  if (Sub && Sub->isDefined())
-    Value -= getSymbolOffset(*Sub);
-
-  bool ShouldAlignPC = FixupFlags & MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
-  assert((ShouldAlignPC ? IsPCRel : true) &&
-    "FKF_IsAlignedDownTo32Bits is only allowed on PC-relative fixups!");
-
-  if (IsPCRel) {
-    uint64_t Offset = getFragmentOffset(*DF) + Fixup.getOffset();
-
-    // A number of ARM fixups in Thumb mode require that the effective PC
-    // address be determined as the 32-bit aligned version of the actual offset.
-    if (ShouldAlignPC) Offset &= ~0x3;
-    Value -= Offset;
-  }
+  if (!RecordReloc)
+    return IsResolved;
 
   // .reloc directive and the backend might force the relocation.
   // Backends that customize shouldForceRelocation generally just need the fixup
@@ -200,12 +204,12 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
     auto TargetVal = Target;
     TargetVal.Cst = Value;
     if (Fixup.getKind() >= FirstLiteralRelocationKind ||
-        getBackend().shouldForceRelocation(*this, Fixup, TargetVal, STI)) {
+        getBackend().shouldForceRelocation(*this, Fixup, TargetVal, STI))
       IsResolved = false;
-      WasForced = true;
-    }
   }
-
+  if (!IsResolved)
+    getWriter().recordRelocation(const_cast<MCAssembler &>(*this), DF, Fixup,
+                                 Target, Value);
   return IsResolved;
 }
 
@@ -844,24 +848,6 @@ void MCAssembler::writeSectionData(raw_ostream &OS,
          OS.tell() - Start == getSectionAddressSize(*Sec));
 }
 
-std::tuple<MCValue, uint64_t, bool>
-MCAssembler::handleFixup(MCFragment &F, const MCFixup &Fixup,
-                         const MCSubtargetInfo *STI) {
-  // Evaluate the fixup.
-  MCValue Target;
-  uint64_t FixedValue;
-  bool WasForced;
-  bool IsResolved =
-      evaluateFixup(Fixup, &F, Target, STI, FixedValue, WasForced);
-  if (!IsResolved) {
-    // The fixup was unresolved, we need a relocation. Inform the object
-    // writer of the relocation, and give it an opportunity to adjust the
-    // fixup value if need be.
-    getWriter().recordRelocation(*this, &F, Fixup, Target, FixedValue);
-  }
-  return std::make_tuple(Target, FixedValue, IsResolved);
-}
-
 void MCAssembler::layout() {
   assert(getBackendPtr() && "Expected assembler backend");
   DEBUG_WITH_TYPE("mc-dump", {
@@ -987,10 +973,9 @@ void MCAssembler::layout() {
       }
       for (const MCFixup &Fixup : Fixups) {
         uint64_t FixedValue;
-        bool IsResolved;
         MCValue Target;
-        std::tie(Target, FixedValue, IsResolved) =
-            handleFixup(Frag, Fixup, STI);
+        bool IsResolved = evaluateFixup(Fixup, &Frag, Target, STI, FixedValue,
+                                        /*RecordReloc=*/true);
         getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue,
                                 IsResolved, STI);
       }
@@ -1012,11 +997,10 @@ bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
   assert(getBackendPtr() && "Expected assembler backend");
   MCValue Target;
   uint64_t Value;
-  bool WasForced;
   bool Resolved = evaluateFixup(Fixup, DF, Target, DF->getSubtargetInfo(),
-                                Value, WasForced);
-  return getBackend().fixupNeedsRelaxationAdvanced(*this, Fixup, Resolved,
-                                                   Value, DF, WasForced);
+                                Value, /*RecordReloc=*/false);
+  return getBackend().fixupNeedsRelaxationAdvanced(*this, Fixup, Target, Value,
+                                                   Resolved);
 }
 
 bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F) const {
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 5a785d7afd1e..b0ec215aec20 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -857,8 +857,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
           "cannot change section address in a non-relocatable file");
     StringMap<AddressUpdate> SectionsToUpdateAddress;
     for (const SectionPatternAddressUpdate &PatternUpdate :
-         make_range(Config.ChangeSectionAddress.rbegin(),
-                    Config.ChangeSectionAddress.rend())) {
+         reverse(Config.ChangeSectionAddress)) {
       for (SectionBase &Sec : Obj.sections()) {
         if (PatternUpdate.SectionPattern.matches(Sec.Name) &&
             SectionsToUpdateAddress.try_emplace(Sec.Name, PatternUpdate.Update)
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index b923181e9726..215355827337 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -82,6 +82,7 @@
 #include "llvm/CodeGen/AtomicExpand.h"
 #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
 #include "llvm/CodeGen/BranchFoldingPass.h"
+#include "llvm/CodeGen/BranchRelaxation.h"
 #include "llvm/CodeGen/CallBrPrepare.h"
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/ComplexDeinterleavingPass.h"
@@ -148,6 +149,7 @@
 #include "llvm/CodeGen/RegUsageInfoPropagate.h"
 #include "llvm/CodeGen/RegisterCoalescerPass.h"
 #include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/CodeGen/RemoveLoadsIntoFakeUses.h"
 #include "llvm/CodeGen/RemoveRedundantDebugValues.h"
 #include "llvm/CodeGen/RenameIndependentSubregs.h"
 #include "llvm/CodeGen/SafeStack.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 300898bb092b..f172271be09a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -926,14 +926,14 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
     IP = getInlineParamsFromOptLevel(Level);
   else
     IP = getInlineParams(PTO.InlinerThreshold);
-  // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
-  // disable hot callsite inline (as much as possible [1]) because it makes
+  // For PreLinkThinLTO + SamplePGO or PreLinkFullLTO + SamplePGO,
+  // set hot-caller threshold to 0 to disable hot
+  // callsite inline (as much as possible [1]) because it makes
   // profile annotation in the backend inaccurate.
   //
   // [1] Note the cost of a function could be below zero due to erased
   // prologue / epilogue.
-  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
-      PGOOpt->Action == PGOOptions::SampleUse)
+  if (isLTOPreLink(Phase) && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
     IP.HotCallSiteThreshold = 0;
 
   if (PGOOpt)
@@ -1023,14 +1023,14 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
   ModulePassManager MPM;
 
   InlineParams IP = getInlineParamsFromOptLevel(Level);
-  // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
-  // disable hot callsite inline (as much as possible [1]) because it makes
+  // For PreLinkThinLTO + SamplePGO or PreLinkFullLTO + SamplePGO,
+  // set hot-caller threshold to 0 to disable hot
+  // callsite inline (as much as possible [1]) because it makes
   // profile annotation in the backend inaccurate.
   //
   // [1] Note the cost of a function could be below zero due to erased
   // prologue / epilogue.
-  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
-      PGOOpt->Action == PGOOptions::SampleUse)
+  if (isLTOPreLink(Phase) && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
     IP.HotCallSiteThreshold = 0;
 
   if (PGOOpt)
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 4c59278eaaa0..dc1dd5d9c7f4 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -2001,14 +2001,12 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before,
   for (auto &A : After.getData()) {
     StringRef Label = A.getKey();
     const BlockDataT<DCData> &BD = A.getValue();
-    unsigned C = NodePosition.count(Label);
-    if (C == 0)
+    auto It = NodePosition.find(Label);
+    if (It == NodePosition.end())
       // This only exists in the after IR.  Create the node.
       createNode(Label, BD, AfterColour);
-    else {
-      assert(C == 1 && "Unexpected multiple nodes.");
-      Nodes[NodePosition[Label]].setCommon(BD);
-    }
+    else
+      Nodes[It->second].setCommon(BD);
     // Add in the edges between the nodes (as common or only in after).
     for (StringMap<std::string>::const_iterator Sink = BD.getData().begin(),
                                                 E = BD.getData().end();
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index e058010f9d26..a7b9f259bfed 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -125,6 +125,9 @@ struct fltSemantics {
 
   /* Whether this semantics can represent signed values */
   bool hasSignedRepr = true;
+
+  /* Whether the sign bit of this semantics is the most significant bit */
+  bool hasSignBitInMSB = true;
 };
 
 static constexpr fltSemantics semIEEEhalf = {15, -14, 11, 16};
@@ -144,9 +147,15 @@ static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
     4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
-static constexpr fltSemantics semFloat8E8M0FNU = {
-    127,   -127, 1, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes,
-    false, false};
+static constexpr fltSemantics semFloat8E8M0FNU = {127,
+                                                  -127,
+                                                  1,
+                                                  8,
+                                                  fltNonfiniteBehavior::NanOnly,
+                                                  fltNanEncoding::AllOnes,
+                                                  false,
+                                                  false,
+                                                  false};
 
 static constexpr fltSemantics semFloat6E3M2FN = {
     4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
@@ -358,6 +367,10 @@ bool APFloatBase::isIEEELikeFP(const fltSemantics &semantics) {
   return SemanticsToEnum(semantics) <= S_IEEEquad;
 }
 
+bool APFloatBase::hasSignBitInMSB(const fltSemantics &semantics) {
+  return semantics.hasSignBitInMSB;
+}
+
 bool APFloatBase::isRepresentableAsNormalIn(const fltSemantics &Src,
                                             const fltSemantics &Dst) {
   // Exponent range must be larger.
diff --git a/llvm/lib/TableGen/CMakeLists.txt b/llvm/lib/TableGen/CMakeLists.txt
index 84815c773699..0f9284c8bb99 100644
--- a/llvm/lib/TableGen/CMakeLists.txt
+++ b/llvm/lib/TableGen/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_component_library(LLVMTableGen
   Record.cpp
   SetTheory.cpp
   StringMatcher.cpp
+  StringToOffsetTable.cpp
   TableGenBackend.cpp
   TableGenBackendSkeleton.cpp
   TGLexer.cpp
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 35600bf2f1f8..ea716215e067 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -64,6 +64,15 @@ WriteIfChanged("write-if-changed", cl::desc("Only write output if it changed"));
 static cl::opt<bool>
 TimePhases("time-phases", cl::desc("Time phases of parser and backend"));
 
+namespace llvm {
+cl::opt<bool> EmitLongStrLiterals(
+    "long-string-literals",
+    cl::desc("when emitting large string tables, prefer string literals over "
+             "comma-separated char literals. This can be a readability and "
+             "compile-time performance win, but upsets some compilers"),
+    cl::Hidden, cl::init(true));
+} // end namespace llvm
+
 static cl::opt<bool> NoWarnOnUnusedTemplateArgs(
     "no-warn-on-unused-template-args",
     cl::desc("Disable unused template argument warnings."));
diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp b/llvm/lib/TableGen/StringToOffsetTable.cpp
new file mode 100644
index 000000000000..d73b5749ad7d
--- /dev/null
+++ b/llvm/lib/TableGen/StringToOffsetTable.cpp
@@ -0,0 +1,120 @@
+//===- StringToOffsetTable.cpp - Emit a big concatenated string -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/StringToOffsetTable.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Main.h"
+
+using namespace llvm;
+
+unsigned StringToOffsetTable::GetOrAddStringOffset(StringRef Str,
+                                                   bool appendZero) {
+  auto [II, Inserted] = StringOffset.insert({Str, size()});
+  if (Inserted) {
+    // Add the string to the aggregate if this is the first time found.
+    AggregateString.append(Str.begin(), Str.end());
+    if (appendZero)
+      AggregateString += '\0';
+  }
+
+  return II->second;
+}
+
+void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, const Twine &Name,
+                                             const Twine &Indent) const {
+  OS << formatv(R"(
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+{0}static constexpr char {1}Storage[] = )",
+                Indent, Name);
+
+  // MSVC silently miscompiles string literals longer than 64k in some
+  // circumstances. The build system sets EmitLongStrLiterals to false when it
+  // detects that it is targetting MSVC. When that option is false and the
+  // string table is longer than 64k, emit it as an array of character
+  // literals.
+  bool UseChars = !EmitLongStrLiterals && AggregateString.size() > (64 * 1024);
+  OS << (UseChars ? "{\n" : "\n");
+
+  llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n");
+  llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0'));
+  // We should always have an empty string at the start, and because these are
+  // null terminators rather than separators, we'll have one at the end as
+  // well. Skip the end one.
+  assert(Strings.front().empty() && "Expected empty initial string!");
+  assert(Strings.back().empty() &&
+         "Expected empty string at the end due to terminators!");
+  Strings.pop_back();
+  for (StringRef Str : Strings) {
+    OS << LineSep << Indent << "  ";
+    // If we can, just emit this as a string literal to be concatenated.
+    if (!UseChars) {
+      OS << "\"";
+      OS.write_escaped(Str);
+      OS << "\\0\"";
+      continue;
+    }
+
+    llvm::ListSeparator CharSep(", ");
+    for (char C : Str) {
+      OS << CharSep << "'";
+      OS.write_escaped(StringRef(&C, 1));
+      OS << "'";
+    }
+    OS << CharSep << "'\\0'";
+  }
+  OS << LineSep << Indent << (UseChars ? "};" : "  ;");
+
+  OS << formatv(R"(
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+{0}static constexpr llvm::StringTable {1} =
+{0}    {1}Storage;
+)",
+                Indent, Name);
+}
+
+void StringToOffsetTable::EmitString(raw_ostream &O) const {
+  // Escape the string.
+  SmallString<256> EscapedStr;
+  raw_svector_ostream(EscapedStr).write_escaped(AggregateString);
+
+  O << "    \"";
+  unsigned CharsPrinted = 0;
+  for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) {
+    if (CharsPrinted > 70) {
+      O << "\"\n    \"";
+      CharsPrinted = 0;
+    }
+    O << EscapedStr[i];
+    ++CharsPrinted;
+
+    // Print escape sequences all together.
+    if (EscapedStr[i] != '\\')
+      continue;
+
+    assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!");
+    if (isDigit(EscapedStr[i + 1])) {
+      assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) &&
+             "Expected 3 digit octal escape!");
+      O << EscapedStr[++i];
+      O << EscapedStr[++i];
+      O << EscapedStr[++i];
+      CharsPrinted += 3;
+    } else {
+      O << EscapedStr[++i];
+      ++CharsPrinted;
+    }
+  }
+  O << "\"";
+}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 6bf6ce716783..68218e59961c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3980,7 +3980,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
       }
 
     if (!InsertBeforeLR)
-      CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
+      llvm::append_range(CSI, VGSaves);
   }
 
   Register LastReg = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d370f8c7ff6e..74217fad82a7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2760,6 +2760,9 @@ bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
   case AArch64::LDRXpre:
   case AArch64::LDURSWi:
   case AArch64::LDRSWpre:
+  // SVE instructions.
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
     return true;
   }
 }
@@ -2912,6 +2915,18 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
       return false;
   }
 
+  // Pairing SVE fills/spills is only valid for little-endian targets that
+  // implement VLS 128.
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
+    if (!Subtarget.isLittleEndian() ||
+        Subtarget.getSVEVectorSizeInBits() != 128)
+      return false;
+  }
+
   // Check if this load/store has a hint to avoid pair formation.
   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
   if (isLdStPairSuppressed(MI))
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 06e633effe87..7c47492cf1a8 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -298,6 +298,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STRXui:
   case AArch64::STRXpre:
   case AArch64::STURXi:
+  case AArch64::STR_ZXI:
   case AArch64::LDRDui:
   case AArch64::LDURDi:
   case AArch64::LDRDpre:
@@ -316,6 +317,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::LDRSui:
   case AArch64::LDURSi:
   case AArch64::LDRSpre:
+  case AArch64::LDR_ZXI:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -361,6 +363,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
     return AArch64::STPDpre;
   case AArch64::STRQui:
   case AArch64::STURQi:
+  case AArch64::STR_ZXI:
     return AArch64::STPQi;
   case AArch64::STRQpre:
     return AArch64::STPQpre;
@@ -386,6 +389,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
     return AArch64::LDPDpre;
   case AArch64::LDRQui:
   case AArch64::LDURQi:
+  case AArch64::LDR_ZXI:
     return AArch64::LDPQi;
   case AArch64::LDRQpre:
     return AArch64::LDPQpre;
@@ -1225,6 +1229,16 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     (void)MIBSXTW;
     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
+    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
+    // variant of the registers.
+    MachineOperand &MOp0 = MIB->getOperand(0);
+    MachineOperand &MOp1 = MIB->getOperand(1);
+    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
+           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
+    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
+    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
+    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
   } else {
     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
   }
@@ -1499,6 +1513,12 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
   if (OpcA == OpcB)
     return !AArch64InstrInfo::isPreLdSt(FirstMI);
 
+  // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
+  // allow pairing them with other instructions.
+  if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
+      OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
+    return false;
+
   // Two pre ld/st of different opcodes cannot be merged either
   if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
     return false;
@@ -2659,7 +2679,8 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
       // Get the needed alignments to check them if
       // ldp-aligned-only/stp-aligned-only features are opted.
       uint64_t MemAlignment = MemOp->getAlign().value();
-      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
+      uint64_t TypeAlignment =
+          Align(MemOp->getSize().getValue().getKnownMinValue()).value();
 
       if (MemAlignment < 2 * TypeAlignment) {
         NumFailedAlignmentCheck++;
@@ -2820,11 +2841,18 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
     }
   // 3) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
+  //    When compiling for SVE 128, also try to combine SVE fill/spill
+  //    instructions into LDP/STP.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
+  //      e.g.,
+  //        ldr z0, [x2]
+  //        ldr z1, [x2, #1, mul vl]
+  //        ; becomes
+  //        ldp q0, q1, [x2]
 
   if (MBB.getParent()->getRegInfo().tracksLiveness()) {
     DefinedInBB.clear();
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 54347b610c50..0ddd17cee134 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -261,15 +261,18 @@ bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
     // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
     // that the upper bits are zero.
     if (RC != &AArch64::FPR32RegClass &&
-        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
+        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass &&
+          RC != &AArch64::ZPRRegClass) ||
          SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
       return false;
-    Register CpySrc = SrcMI->getOperand(1).getReg();
+    Register CpySrc;
     if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
       CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
       BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
               TII->get(TargetOpcode::COPY), CpySrc)
           .add(SrcMI->getOperand(1));
+    } else {
+      CpySrc = SrcMI->getOperand(1).getReg();
     }
     BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
             TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ca1a48690195..2b9d32f9208f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1492,9 +1492,8 @@ static bool isAllActivePredicate(Value *Pred) {
     if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
         cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
       Pred = UncastedPred;
-
-  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
-                         m_ConstantInt<AArch64SVEPredPattern::all>()));
+  auto *C = dyn_cast<Constant>(Pred);
+  return (C && C->isAllOnesValue());
 }
 
 // Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
@@ -1701,14 +1700,7 @@ static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   LLVMContext &Ctx = II.getContext();
 
-  // Check that the predicate is all active
-  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
-  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
-    return std::nullopt;
-
-  const auto PTruePattern =
-      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
-  if (PTruePattern != AArch64SVEPredPattern::all)
+  if (!isAllActivePredicate(II.getArgOperand(0)))
     return std::nullopt;
 
   // Check that we have a compare of zero..
@@ -2118,8 +2110,7 @@ instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
   auto *OpPredicate = II.getOperand(0);
   auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
   if (BinOpCode == Instruction::BinaryOpsEnd ||
-      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
-                              m_ConstantInt<AArch64SVEPredPattern::all>())))
+      !isAllActivePredicate(OpPredicate))
     return std::nullopt;
   auto BinOp = IC.Builder.CreateBinOpFMF(
       BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
@@ -2641,6 +2632,13 @@ static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
   return std::nullopt;
 }
 
+static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
+                                                     IntrinsicInst &II) {
+  if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
+    return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
+  return std::nullopt;
+}
+
 std::optional<Instruction *>
 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
@@ -2744,6 +2742,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEDupqLane(IC, II);
   case Intrinsic::aarch64_sve_insr:
     return instCombineSVEInsr(IC, II);
+  case Intrinsic::aarch64_sve_ptrue:
+    return instCombinePTrue(IC, II);
   }
 
   return std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7b4d00c8214c..153b14ce6050 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1491,12 +1491,10 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
                       return isBitPack(Opc);
                     });
 
-  auto *PackPred =
-      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
-                   [&isBitPack](SDep &Pred) {
-                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
-                     return isBitPack(Opc);
-                   });
+  auto *PackPred = llvm::find_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
+    auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+    return isBitPack(Opc);
+  });
 
   if (PackPred == (*TempMFMA)->Preds.end())
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 94ecb6ba9a2b..6c01f6dd370f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -729,6 +729,11 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   // complicated.
   if (isa<FixedVectorType>(AccessTy)) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    // If the type size and the store size don't match, we would need to do more
+    // than just bitcast to translate between an extracted/insertable subvectors
+    // and the accessed value.
+    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
+      return false;
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
   }
@@ -813,15 +818,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
       unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-      // Expand vector if required to match padding of inner type,
-      // i.e. odd size subvectors.
-      // Storage size of new vector must match that of alloca for correct
-      // behaviour of byte offsets and GEP computation.
-      if (NumElems * ElementSize != AllocaSize)
-        NumElems = AllocaSize / ElementSize;
-      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
     }
   }
 
@@ -861,7 +868,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 43885587ad81..ca093be61d11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -1081,8 +1081,7 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
       IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
 
   for (GlobalVariable *GV : LDSGlobals) {
-    const auto *GVIt =
-        std::find(OrdereLDSGlobals.begin(), OrdereLDSGlobals.end(), GV);
+    const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
     assert(GVIt != OrdereLDSGlobals.end());
     uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8f3138acaea0..b59e94085272 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Analysis/KernelInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/AtomicExpand.h"
+#include "llvm/CodeGen/BranchRelaxation.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -2205,7 +2206,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
     addPass(AMDGPUInsertDelayAluPass());
   }
 
-  // TODO: addPass(BranchRelaxationPass());
+  addPass(BranchRelaxationPass());
 }
 
 bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da9567..bd95bcd89e18 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4055,8 +4055,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (IsChainCallConv)
-    Ops.insert(Ops.end(), ChainCallSpecialArgs.begin(),
-               ChainCallSpecialArgs.end());
+    llvm::append_range(Ops, ChainCallSpecialArgs);
 
   // Add argument registers to the end of the list so that they are known live
   // into the call.
@@ -15526,9 +15525,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
   // Adjust the writemask in the node
   SmallVector<SDValue, 12> Ops;
-  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
+  llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
-  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
+  llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
 
   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index efdf642e29db..1673bfa15267 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -352,7 +352,7 @@ void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
 
     // Replace the register in SpillPhysVGPRs. This is needed to look for free
     // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
-    auto *RegItr = std::find(SpillPhysVGPRs.begin(), SpillPhysVGPRs.end(), Reg);
+    auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
     if (RegItr != SpillPhysVGPRs.end()) {
       unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
       SpillPhysVGPRs[Idx] = NewReg;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 67b44d2a19ef..0611a97a3cdc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -624,7 +624,9 @@ bool isMAC(unsigned Opc) {
          Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
          Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
          Opc == AMDGPU::V_FMAC_F16_e64_gfx10 ||
+         Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 ||
          Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx11 ||
+         Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 ||
          Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx12 ||
          Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
          Opc == AMDGPU::V_DOT2C_F32_BF16_e64_vi ||
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 2d45c42e724d..f8790dd063ae 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2849,12 +2849,10 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
         //  Erase the entry into the DbgValueSinkCandidates for the DBG_VALUE
         //  that was moved.
         auto DbgVar = createDebugVariableFromMachineInstr(DbgInstr);
-        auto DbgIt = DbgValueSinkCandidates.find(DbgVar);
-        // If the instruction is a DBG_VALUE_LIST, it may have already been
-        // erased from the DbgValueSinkCandidates. Only erase if it exists in
-        // the DbgValueSinkCandidates.
-        if (DbgIt != DbgValueSinkCandidates.end())
-          DbgValueSinkCandidates.erase(DbgIt);
+        // Erase DbgVar from DbgValueSinkCandidates if still present.  If the
+        // instruction is a DBG_VALUE_LIST, it may have already been erased from
+        // DbgValueSinkCandidates.
+        DbgValueSinkCandidates.erase(DbgVar);
         // Zero out original dbg instr
         forEachDbgRegOperand(DbgInstr,
                              [&](MachineOperand &Op) { Op.setReg(0); });
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index bed15bdc274b..88bcf4cc9c6c 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -335,8 +335,36 @@ const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
   return nullptr;
 }
 
-bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                         uint64_t Value) const {
+static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym,
+                              unsigned FixupKind) {
+  // Create relocations for unconditional branches to function symbols with
+  // different execution mode in ELF binaries.
+  if (!Sym || !Sym->isELF())
+    return false;
+  unsigned Type = cast<MCSymbolELF>(Sym)->getType();
+  if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
+    if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
+      return true;
+    if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br ||
+                                  FixupKind == ARM::fixup_arm_thumb_bl ||
+                                  FixupKind == ARM::fixup_t2_condbranch ||
+                                  FixupKind == ARM::fixup_t2_uncondbranch))
+      return true;
+  }
+  return false;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
+                                                 const MCFixup &Fixup,
+                                                 const MCValue &Target,
+                                                 uint64_t Value,
+                                                 bool Resolved) const {
+  const MCSymbol *Sym = Target.getAddSym();
+  if (needsInterworking(Asm, Sym, Fixup.getTargetKind()))
+    return true;
+
+  if (!Resolved)
+    return true;
   return reasonForFixupRelaxation(Fixup, Value);
 }
 
@@ -973,18 +1001,8 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   }
   // Create relocations for unconditional branches to function symbols with
   // different execution mode in ELF binaries.
-  if (Sym && Sym->isELF()) {
-    unsigned Type = cast<MCSymbolELF>(Sym)->getType();
-    if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
-      if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
-        return true;
-      if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br ||
-                                    FixupKind == ARM::fixup_arm_thumb_bl ||
-                                    FixupKind == ARM::fixup_t2_condbranch ||
-                                    FixupKind == ARM::fixup_t2_uncondbranch))
-        return true;
-    }
-  }
+  if (needsInterworking(Asm, Sym, Fixup.getTargetKind()))
+    return true;
   // We must always generate a relocation for BL/BLX instructions if we have
   // a symbol to reference, as the linker relies on knowing the destination
   // symbol's thumb-ness to get interworking right.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 4e4df16d890c..57588d989d54 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -54,8 +54,9 @@ public:
   const char *reasonForFixupRelaxation(const MCFixup &Fixup,
                                        uint64_t Value) const;
 
-  bool fixupNeedsRelaxation(const MCFixup &Fixup,
-                            uint64_t Value) const override;
+  bool fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                    const MCFixup &, const MCValue &, uint64_t,
+                                    bool) const override;
 
   void relaxInstruction(MCInst &Inst,
                         const MCSubtargetInfo &STI) const override;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index ec11a78f8a7a..9d35626f449d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
 #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cstddef>
 #include <cstdint>
@@ -60,7 +61,7 @@ public:
 
   /// Emit unwind raw opcodes
   void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) {
-    Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end());
+    llvm::append_range(Ops, Opcodes);
     OpBegins.push_back(OpBegins.back() + Opcodes.size());
   }
 
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index ea7968f01ee4..792a55555a0d 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -171,16 +171,13 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   }
 }
 
-bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
+bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &,
                                                   const MCFixup &Fixup,
-                                                  bool Resolved, uint64_t Value,
-                                                  const MCRelaxableFragment *DF,
-                                                  const bool WasForced) const {
-  // Return true if the symbol is actually unresolved.
-  // Resolved could be always false when shouldForceRelocation return true.
-  // We use !WasForced to indicate that the symbol is unresolved and not forced
-  // by shouldForceRelocation.
-  if (!Resolved && !WasForced)
+                                                  const MCValue &,
+                                                  uint64_t Value,
+                                                  bool Resolved) const {
+  // Return true if the symbol is unresolved.
+  if (!Resolved)
     return true;
 
   int64_t Offset = int64_t(Value);
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index 07c5065ea4b5..3ce2f37212dc 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -39,11 +39,9 @@ public:
   bool mayNeedRelaxation(const MCInst &Inst,
                          const MCSubtargetInfo &STI) const override;
 
-  bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
-                                    const MCFixup &Fixup, bool Resolved,
-                                    uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const bool WasForced) const override;
+  bool fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                    const MCFixup &, const MCValue &, uint64_t,
+                                    bool) const override;
 
   bool writeNopData(raw_ostream &OS, uint64_t Count,
                     const MCSubtargetInfo *STI) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
index 0c31e274a4ee..71bdfc6657c5 100644
--- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp
@@ -160,11 +160,8 @@ bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) {
     if (!Mo->isReg() || !Mo->isDef())
       continue;
     unsigned R = Mo->getReg();
-    for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) {
-      ImmediateMap::iterator F = IMap.find(*AR);
-      if (F != IMap.end())
-        IMap.erase(F);
-    }
+    for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR)
+      IMap.erase(*AR);
   }
   return true;
 }
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 1c1454a41cc0..ae1dac57d929 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -41,6 +41,7 @@ class HexagonAsmBackend : public MCAsmBackend {
   uint8_t OSABI;
   StringRef CPU;
   mutable uint64_t relaxedCnt;
+  mutable const MCInst *RelaxedMCB = nullptr;
   std::unique_ptr <MCInstrInfo> MCII;
   std::unique_ptr <MCInst *> RelaxTarget;
   MCInst * Extender;
@@ -560,17 +561,17 @@ public:
   /// \param Inst - The instruction to test.
   bool mayNeedRelaxation(MCInst const &Inst,
                          const MCSubtargetInfo &STI) const override {
+    RelaxedMCB = &Inst;
     return true;
   }
 
   /// fixupNeedsRelaxation - Target specific predicate for whether a given
   /// fixup requires the associated instruction to be relaxed.
   bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
-                                    const MCFixup &Fixup, bool Resolved,
+                                    const MCFixup &Fixup, const MCValue &,
                                     uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const bool WasForced) const override {
-    MCInst const &MCB = DF->getInst();
+                                    bool Resolved) const override {
+    MCInst const &MCB = *RelaxedMCB;
     assert(HexagonMCInstrInfo::isBundle(MCB));
 
     *RelaxTarget = nullptr;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 5f7974ab6cae..0c295997ab52 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -602,14 +602,6 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   }
 }
 
-bool MipsAsmBackend::isMicroMips(const MCSymbol *Sym) const {
-  if (const auto *ElfSym = dyn_cast<const MCSymbolELF>(Sym)) {
-    if (ElfSym->getOther() & ELF::STO_MIPS_MICROMIPS)
-      return true;
-  }
-  return false;
-}
-
 namespace {
 
 class WindowsMipsAsmBackend : public MipsAsmBackend {
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 9752615422b6..1e8504aaf2aa 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -54,8 +54,6 @@ public:
   bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
                              const MCValue &Target,
                              const MCSubtargetInfo *STI) override;
-
-  bool isMicroMips(const MCSymbol *Sym) const override;
 }; // class MipsAsmBackend
 
 } // namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index d9beab7ec42e..8feae341893a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -50,8 +50,7 @@ public:
   /// Check if the symbol has a mapping. Having a mapping means the handle is
   /// replaced with a reference
   bool checkImageHandleSymbol(StringRef Symbol) const {
-    return ImageHandleList.end() !=
-           std::find(ImageHandleList.begin(), ImageHandleList.end(), Symbol);
+    return llvm::is_contained(ImageHandleList, Symbol);
   }
 };
 }
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 55f1a90b2a01..f246aee4da39 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -539,17 +539,16 @@ public:
   // True if operand is a symbol with no modifiers, or a constant with no
   // modifiers and isShiftedInt<N-1, 1>(Op).
   template <int N> bool isBareSimmNLsb0() const {
-    int64_t Imm;
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
     if (!isImm())
       return false;
-    bool IsConstantImm = evaluateConstantImm(getImm(), Imm);
-    bool IsValid;
-    if (!IsConstantImm)
-      IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK);
-    else
-      IsValid = isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
-    return IsValid && VK == RISCVMCExpr::VK_None;
+
+    int64_t Imm;
+    if (evaluateConstantImm(getImm(), Imm))
+      return isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
+
+    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
+           VK == RISCVMCExpr::VK_None;
   }
 
   // True if operand is a symbol with no modifiers, or a constant with no
@@ -2079,9 +2078,6 @@ ParseStatus RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
 
   SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size());
 
-  if (Identifier.consume_back("@plt"))
-    return Error(getLoc(), "'@plt' operand not valid for instruction");
-
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
 
   if (Sym->isVariable()) {
@@ -2129,8 +2125,9 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
     Lex();
     Lex();
     StringRef PLT;
+    SMLoc Loc = getLoc();
     if (getParser().parseIdentifier(PLT) || PLT != "plt")
-      return ParseStatus::Failure;
+      return Error(Loc, "@ (except the deprecated/ignored @plt) is disallowed");
   } else if (!getLexer().peekTok().is(AsmToken::EndOfStatement)) {
     // Avoid parsing the register in `call rd, foo` as a call symbol.
     return ParseStatus::NoMatch;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 49c8c6957aa3..b36b8bd3fb43 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -141,20 +141,19 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   return STI->hasFeature(RISCV::FeatureRelax) || ForceRelocs;
 }
 
-bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(
-    const MCAssembler &Asm, const MCFixup &Fixup, bool Resolved, uint64_t Value,
-    const MCRelaxableFragment *DF, const bool WasForced) const {
+bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                                   const MCFixup &Fixup,
+                                                   const MCValue &,
+                                                   uint64_t Value,
+                                                   bool Resolved) const {
   if (!RelaxBranches)
     return false;
 
   int64_t Offset = int64_t(Value);
   unsigned Kind = Fixup.getTargetKind();
 
-  // Return true if the symbol is actually unresolved.
-  // Resolved could be always false when shouldForceRelocation return true.
-  // We use !WasForced to indicate that the symbol is unresolved and not forced
-  // by shouldForceRelocation.
-  if (!Resolved && !WasForced)
+  // Return true if the symbol is unresolved.
+  if (!Resolved)
     return true;
 
   switch (Kind) {
@@ -594,12 +593,9 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   }
 }
 
-bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm,
-                                          const MCFixup &Fixup,
-                                          const MCFragment *DF,
-                                          const MCValue &Target,
-                                          const MCSubtargetInfo *STI,
-                                          uint64_t &Value, bool &WasForced) {
+bool RISCVAsmBackend::evaluateTargetFixup(
+    const MCAssembler &Asm, const MCFixup &Fixup, const MCFragment *DF,
+    const MCValue &Target, const MCSubtargetInfo *STI, uint64_t &Value) {
   const MCFixup *AUIPCFixup;
   const MCFragment *AUIPCDF;
   MCValue AUIPCTarget;
@@ -646,12 +642,7 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm,
   Value = Asm.getSymbolOffset(SA) + AUIPCTarget.getConstant();
   Value -= Asm.getFragmentOffset(*AUIPCDF) + AUIPCFixup->getOffset();
 
-  if (shouldForceRelocation(Asm, *AUIPCFixup, AUIPCTarget, STI)) {
-    WasForced = true;
-    return false;
-  }
-
-  return true;
+  return !shouldForceRelocation(Asm, *AUIPCFixup, AUIPCTarget, STI);
 }
 
 bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index f5e8d340d9bc..5d585b4efc11 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -49,8 +49,8 @@ public:
 
   bool evaluateTargetFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                            const MCFragment *DF, const MCValue &Target,
-                           const MCSubtargetInfo *STI, uint64_t &Value,
-                           bool &WasForced) override;
+                           const MCSubtargetInfo *STI,
+                           uint64_t &Value) override;
 
   bool handleAddSubRelocations(const MCAssembler &Asm, const MCFragment &F,
                                const MCFixup &Fixup, const MCValue &Target,
@@ -68,12 +68,9 @@ public:
                              const MCValue &Target,
                              const MCSubtargetInfo *STI) override;
 
-  bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
-                                    const MCFixup &Fixup, bool Resolved,
-                                    uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const bool WasForced) const override;
-
+  bool fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                    const MCFixup &, const MCValue &, uint64_t,
+                                    bool) const override;
 
   std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 83ecf805489c..972bee5a0aa7 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -34,6 +34,10 @@ static cl::opt<bool>
               cl::desc("Disable the emission of assembler pseudo instructions"),
               cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EmitX8AsFP("riscv-emit-x8-as-fp",
+                                cl::desc("Emit x8 as fp instead of s0"),
+                                cl::init(false), cl::Hidden);
+
 // Print architectural register names rather than the ABI names (such as x2
 // instead of sp).
 // TODO: Make RISCVInstPrinter::getRegisterName non-static so that this can a
@@ -54,6 +58,11 @@ bool RISCVInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
     ArchRegNames = true;
     return true;
   }
+  if (Opt == "emit-x8-as-fp") {
+    if (!ArchRegNames)
+      EmitX8AsFP = true;
+    return true;
+  }
 
   return false;
 }
@@ -311,6 +320,13 @@ void RISCVInstPrinter::printVMaskReg(const MCInst *MI, unsigned OpNo,
 }
 
 const char *RISCVInstPrinter::getRegisterName(MCRegister Reg) {
+  // When PrintAliases is enabled, and EmitX8AsFP is enabled, x8 will be printed
+  // as fp instead of s0. Note that these similar registers are not replaced:
+  // - X8_H: used for f16 register in zhinx
+  // - X8_W: used for f32 register in zfinx
+  // - X8_X9: used for GPR Pair
+  if (!ArchRegNames && EmitX8AsFP && Reg == RISCV::X8)
+    return "fp";
   return getRegisterName(Reg, ArchRegNames ? RISCV::NoRegAltName
                                            : RISCV::ABIRegAltName);
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 53e88aa48556..86702bbe58f0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -125,8 +125,7 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
     if (Token.starts_with("+")) {
       EnabledExtensions.insert(NameValuePair->second);
     } else if (EnabledExtensions.count(NameValuePair->second)) {
-      if (std::find(Tokens.begin(), Tokens.end(), "+" + ExtensionName.str()) !=
-          Tokens.end())
+      if (llvm::is_contained(Tokens, "+" + ExtensionName.str()))
         return O.error(
             "Extension cannot be allowed and disallowed at the same time: " +
             ExtensionName.str());
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
index 156b40eb43a3..68b24dbe9f00 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
@@ -55,7 +55,7 @@ void WebAssemblyAsmTypeCheck::funcDecl(const wasm::WasmSignature &Sig) {
 
 void WebAssemblyAsmTypeCheck::localDecl(
     const SmallVectorImpl<wasm::ValType> &Locals) {
-  LocalTypes.insert(LocalTypes.end(), Locals.begin(), Locals.end());
+  llvm::append_range(LocalTypes, Locals);
 }
 
 void WebAssemblyAsmTypeCheck::dumpTypeStack(Twine Msg) {
@@ -357,8 +357,7 @@ bool WebAssemblyAsmTypeCheck::checkTryTable(SMLoc ErrorLoc,
         Opcode == wasm::WASM_OPCODE_CATCH_REF) {
       if (!getSignature(ErrorLoc, Inst.getOperand(OpIdx++),
                         wasm::WASM_SYMBOL_TYPE_TAG, Sig))
-        SentTypes.insert(SentTypes.end(), Sig->Params.begin(),
-                         Sig->Params.end());
+        llvm::append_range(SentTypes, Sig->Params);
       else
         Error = true;
     }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 767818107de8..cb23487e6fbe 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -177,11 +177,9 @@ public:
   bool mayNeedRelaxation(const MCInst &Inst,
                          const MCSubtargetInfo &STI) const override;
 
-  bool fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
-                                    const MCFixup &Fixup, bool Resolved,
-                                    uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const bool WasForced) const override;
+  bool fixupNeedsRelaxationAdvanced(const MCAssembler &,
+                                    const MCFixup &, const MCValue &, uint64_t,
+                                    bool) const override;
 
   void relaxInstruction(MCInst &Inst,
                         const MCSubtargetInfo &STI) const override;
@@ -731,22 +729,22 @@ bool X86AsmBackend::mayNeedRelaxation(const MCInst &MI,
           MI.getOperand(MI.getNumOperands() - 1 - SkipOperands).isExpr());
 }
 
-bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
+bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &,
                                                  const MCFixup &Fixup,
-                                                 bool Resolved, uint64_t Value,
-                                                 const MCRelaxableFragment *DF,
-                                                 const bool WasForced) const {
+                                                 const MCValue &Target,
+                                                 uint64_t Value,
+                                                 bool Resolved) const {
   // If resolved, relax if the value is too big for a (signed) i8.
+  //
+  // Currently, `jmp local@plt` relaxes JMP even if the offset is small,
+  // different from gas.
   if (Resolved)
-    return !isInt<8>(Value);
+    return !isInt<8>(Value) || Target.getSpecifier();
 
   // Otherwise, relax unless there is a @ABS8 specifier.
-  if (Fixup.getKind() == FK_Data_1) {
-    MCValue Target;
-    if (Fixup.getValue()->evaluateAsRelocatable(Target, &Asm) &&
-        Target.getAddSym() && Target.getSpecifier() == X86MCExpr::VK_ABS8)
-      return false;
-  }
+  if (Fixup.getKind() == FK_Data_1 && Target.getAddSym() &&
+      Target.getSpecifier() == X86MCExpr::VK_ABS8)
+    return false;
   return true;
 }
 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 38761e1fd7ee..577428cad6d6 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -338,7 +338,7 @@ def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
                                       "Support AVX10.1 up to 256-bit instruction",
                                       [FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
                                        FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
-                                       FeatureVAES, FeatureVPCLMULQDQ, FeatureFP16]>;
+                                       FeatureFP16]>;
 def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
                                           "Support AVX10.1 up to 512-bit instruction",
                                           [FeatureAVX10_1, FeatureEVEX512]>;
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index e4b7ed7cf9b6..2ae6dd6b3d1e 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -637,8 +637,7 @@ constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
 constexpr FeatureBitset ImpliedFeaturesAVX10_1 =
     FeatureAVX512CD | FeatureAVX512VBMI | FeatureAVX512IFMA |
     FeatureAVX512VNNI | FeatureAVX512BF16 | FeatureAVX512VPOPCNTDQ |
-    FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureVAES | FeatureVPCLMULQDQ |
-    FeatureAVX512FP16;
+    FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16;
 constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 =
     FeatureAVX10_1 | FeatureEVEX512;
 constexpr FeatureBitset ImpliedFeaturesAVX10_2 = FeatureAVX10_1;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index df1f6fddeba6..a5e0251277d8 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -3807,8 +3807,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
           // Make sure we don't pick a previously existing caller edge of this
           // Node, which would be processed on a different iteration of the
           // outer loop over the saved CallerEdges.
-          if (std::find(CallerEdges.begin(), CallerEdges.end(), E) !=
-              CallerEdges.end())
+          if (llvm::is_contained(CallerEdges, E))
             continue;
           // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
           // are updated further below for all cases where we just invoked
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 19bf81137aab..f1b225c0f238 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2645,7 +2645,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
       !Builder.GetInsertBlock()->getParent()->hasFnAttribute(
           Attribute::NoImplicitFloat)) {
     Type *EltTy = CastOp->getType()->getScalarType();
-    if (EltTy->isFloatingPointTy() && EltTy->isIEEE()) {
+    if (EltTy->isFloatingPointTy() &&
+        APFloat::hasSignBitInMSB(EltTy->getFltSemantics())) {
       Value *FAbs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, CastOp);
       return new BitCastInst(FAbs, I.getType());
     }
@@ -4058,7 +4059,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
       !Builder.GetInsertBlock()->getParent()->hasFnAttribute(
           Attribute::NoImplicitFloat)) {
     Type *EltTy = CastOp->getType()->getScalarType();
-    if (EltTy->isFloatingPointTy() && EltTy->isIEEE()) {
+    if (EltTy->isFloatingPointTy() &&
+        APFloat::hasSignBitInMSB(EltTy->getFltSemantics())) {
       Value *FAbs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, CastOp);
       Value *FNegFAbs = Builder.CreateFNeg(FAbs);
       return new BitCastInst(FNegFAbs, I.getType());
@@ -4860,7 +4862,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
         !Builder.GetInsertBlock()->getParent()->hasFnAttribute(
             Attribute::NoImplicitFloat)) {
       Type *EltTy = CastOp->getType()->getScalarType();
-      if (EltTy->isFloatingPointTy() && EltTy->isIEEE()) {
+      if (EltTy->isFloatingPointTy() &&
+          APFloat::hasSignBitInMSB(EltTy->getFltSemantics())) {
         Value *FNeg = Builder.CreateFNeg(CastOp);
         return new BitCastInst(FNeg, I.getType());
       }
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index d872a381050c..889b43a843be 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2294,10 +2294,14 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) {
   AliasSetTracker AST(BatchAA);
 
   auto IsPotentiallyPromotable = [L](const Instruction *I) {
-    if (const auto *SI = dyn_cast<StoreInst>(I))
-      return L->isLoopInvariant(SI->getPointerOperand());
-    if (const auto *LI = dyn_cast<LoadInst>(I))
-      return L->isLoopInvariant(LI->getPointerOperand());
+    if (const auto *SI = dyn_cast<StoreInst>(I)) {
+      const Value *PtrOp = SI->getPointerOperand();
+      return !isa<ConstantData>(PtrOp) && L->isLoopInvariant(PtrOp);
+    }
+    if (const auto *LI = dyn_cast<LoadInst>(I)) {
+      const Value *PtrOp = LI->getPointerOperand();
+      return !isa<ConstantData>(PtrOp) && L->isLoopInvariant(PtrOp);
+    }
     return false;
   };
 
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 60619dbe2f58..950344722b5c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -754,9 +754,7 @@ public:
   /// Erase \p Inst from both ShapeMap (if an entry exists) and erase \p Inst
   /// itself.
   void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
-    auto Iter = ShapeMap.find(Inst);
-    if (Iter != ShapeMap.end())
-      ShapeMap.erase(Iter);
+    ShapeMap.erase(Inst);
     Inst->eraseFromParent();
   }
 
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 2f5cf45a1d3d..f98a69380464 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -3287,20 +3287,21 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,
   // Seed the liveness for each individual block
   for (BasicBlock &BB : F) {
     Data.KillSet[&BB] = computeKillSet(&BB, GC);
-    Data.LiveSet[&BB].clear();
-    computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB], GC);
+    auto &LiveSet = Data.LiveSet[&BB];
+    LiveSet.clear();
+    computeLiveInValues(BB.rbegin(), BB.rend(), LiveSet, GC);
 
 #ifndef NDEBUG
     for (Value *Kill : Data.KillSet[&BB])
       assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
 #endif
 
-    Data.LiveOut[&BB] = SetVector<Value *>();
-    computeLiveOutSeed(&BB, Data.LiveOut[&BB], GC);
-    Data.LiveIn[&BB] = Data.LiveSet[&BB];
-    Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
-    Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
-    if (!Data.LiveIn[&BB].empty())
+    auto &Out = Data.LiveOut[&BB] = SetVector<Value *>();
+    computeLiveOutSeed(&BB, Out, GC);
+    auto &In = Data.LiveIn[&BB] = Data.LiveSet[&BB];
+    In.set_union(Out);
+    In.set_subtract(Data.KillSet[&BB]);
+    if (!In.empty())
       Worklist.insert_range(predecessors(&BB));
   }
 
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 41bf202230e2..e25ec6c3b2a5 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -111,20 +111,23 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
 
   Value *Ret = nullptr;
 
-  // Check to see if there is already a cast!
-  for (User *U : V->users()) {
-    if (U->getType() != Ty)
-      continue;
-    CastInst *CI = dyn_cast<CastInst>(U);
-    if (!CI || CI->getOpcode() != Op)
-      continue;
+  if (!isa<Constant>(V)) {
+    // Check to see if there is already a cast!
+    for (User *U : V->users()) {
+      if (U->getType() != Ty)
+        continue;
+      CastInst *CI = dyn_cast<CastInst>(U);
+      if (!CI || CI->getOpcode() != Op)
+        continue;
 
-    // Found a suitable cast that is at IP or comes before IP. Use it. Note that
-    // the cast must also properly dominate the Builder's insertion point.
-    if (IP->getParent() == CI->getParent() && &*BIP != CI &&
-        (&*IP == CI || CI->comesBefore(&*IP))) {
-      Ret = CI;
-      break;
+      // Found a suitable cast that is at IP or comes before IP. Use it. Note
+      // that the cast must also properly dominate the Builder's insertion
+      // point.
+      if (IP->getParent() == CI->getParent() && &*BIP != CI &&
+          (&*IP == CI || CI->comesBefore(&*IP))) {
+        Ret = CI;
+        break;
+      }
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index eac7e7c209c9..7f53aa7d4f73 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4087,9 +4087,7 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
 
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
 
-  if (!Cond ||
-      (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond) &&
-       !isa<SelectInst>(Cond)) ||
+  if (!Cond || !isa<CmpInst, BinaryOperator, SelectInst, TruncInst>(Cond) ||
       Cond->getParent() != BB || !Cond->hasOneUse())
     return false;
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 2d0027d97601..4e37c587dc97 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -3002,6 +3002,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBa
     return nullptr;
 
   Value *Arg = CI->getArgOperand(0);
+  if (isa<ConstantData>(Arg))
+    return nullptr;
+
   SmallVector<CallInst *, 1> SinCalls;
   SmallVector<CallInst *, 1> CosCalls;
   SmallVector<CallInst *, 1> SinCosCalls;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0acca63503af..a28cda9fe62b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1373,21 +1373,6 @@ public:
     return false;
   }
 
-  /// Returns true if we're required to use a scalar epilogue for at least
-  /// the final iteration of the original loop for all VFs in \p Range.
-  /// A scalar epilogue must either be required for all VFs in \p Range or for
-  /// none.
-  bool requiresScalarEpilogue(VFRange Range) const {
-    auto RequiresScalarEpilogue = [this](ElementCount VF) {
-      return requiresScalarEpilogue(VF.isVector());
-    };
-    bool IsRequired = all_of(Range, RequiresScalarEpilogue);
-    assert(
-        (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
-        "all VFs in range must agree on whether a scalar epilogue is required");
-    return IsRequired;
-  }
-
   /// Returns true if a scalar epilogue is not allowed due to optsize or a
   /// loop hint annotation.
   bool isScalarEpilogueAllowed() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 2cd23efcf3ea..7934c47ee5ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -84,13 +84,15 @@ struct deferredval_ty {
 /// whichever value m_VPValue(X) populated.
 inline deferredval_ty m_Deferred(VPValue *const &V) { return V; }
 
-/// Match a specified integer value or vector of all elements of that
-/// value. \p BitWidth optionally specifies the bitwidth the matched constant
-/// must have. If it is 0, the matched constant can have any bitwidth.
-template <unsigned BitWidth = 0> struct specific_intval {
-  APInt Val;
+/// Match an integer constant or vector of constants if Pred::isValue returns
+/// true for the APInt. \p BitWidth optionally specifies the bitwidth the
+/// matched constant must have. If it is 0, the matched constant can have any
+/// bitwidth.
+template <typename Pred, unsigned BitWidth = 0> struct int_pred_ty {
+  Pred P;
 
-  specific_intval(APInt V) : Val(std::move(V)) {}
+  int_pred_ty(Pred P) : P(std::move(P)) {}
+  int_pred_ty() : P() {}
 
   bool match(VPValue *VPV) const {
     if (!VPV->isLiveIn())
@@ -108,17 +110,45 @@ template <unsigned BitWidth = 0> struct specific_intval {
 
     if (BitWidth != 0 && CI->getBitWidth() != BitWidth)
       return false;
-    return APInt::isSameValue(CI->getValue(), Val);
+    return P.isValue(CI->getValue());
   }
 };
 
+/// Match a specified integer value or vector of all elements of that
+/// value. \p BitWidth optionally specifies the bitwidth the matched constant
+/// must have. If it is 0, the matched constant can have any bitwidth.
+struct is_specific_int {
+  APInt Val;
+
+  is_specific_int(APInt Val) : Val(std::move(Val)) {}
+
+  bool isValue(const APInt &C) const { return APInt::isSameValue(Val, C); }
+};
+
+template <unsigned Bitwidth = 0>
+using specific_intval = int_pred_ty<is_specific_int, Bitwidth>;
+
 inline specific_intval<0> m_SpecificInt(uint64_t V) {
-  return specific_intval<0>(APInt(64, V));
+  return specific_intval<0>(is_specific_int(APInt(64, V)));
 }
 
-inline specific_intval<1> m_False() { return specific_intval<1>(APInt(64, 0)); }
+inline specific_intval<1> m_False() {
+  return specific_intval<1>(is_specific_int(APInt(64, 0)));
+}
 
-inline specific_intval<1> m_True() { return specific_intval<1>(APInt(64, 1)); }
+inline specific_intval<1> m_True() {
+  return specific_intval<1>(is_specific_int(APInt(64, 1)));
+}
+
+struct is_all_ones {
+  bool isValue(const APInt &C) const { return C.isAllOnes(); }
+};
+
+/// Match an integer or vector with all bits set.
+/// For vectors, this includes constants with undefined elements.
+inline int_pred_ty<is_all_ones> m_AllOnes() {
+  return int_pred_ty<is_all_ones>();
+}
 
 /// Matching combinators
 template <typename LTy, typename RTy> struct match_combine_or {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 67a55aa67c97..dc9f953f7447 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -994,6 +994,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // OR x, 1 -> 1.
+  if (match(&R, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
+    R.getVPSingleValue()->replaceAllUsesWith(
+        R.getOperand(0) == X ? R.getOperand(1) : R.getOperand(0));
+    R.eraseFromParent();
+    return;
+  }
+
   if (match(&R, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
     return R.getVPSingleValue()->replaceAllUsesWith(X);
 
diff --git a/llvm/test/Analysis/ScalarEvolution/pr135531.ll b/llvm/test/Analysis/ScalarEvolution/pr135531.ll
new file mode 100644
index 000000000000..e172d56d3a51
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/pr135531.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -disable-output -passes='print<scalar-evolution>' < %s 2>&1 | FileCheck %s
+
+define i32 @pr135511(i32 %x) {
+; CHECK-LABEL: 'pr135511'
+; CHECK-NEXT:  Classifying expressions for: @pr135511
+; CHECK-NEXT:    %and = and i32 %x, 16382
+; CHECK-NEXT:    --> (2 * (zext i13 (trunc i32 (%x /u 2) to i13) to i32))<nuw><nsw> U: [0,16383) S: [0,16383)
+; CHECK-NEXT:    %neg = sub nsw i32 0, %and
+; CHECK-NEXT:    --> (-2 * (zext i13 (trunc i32 (%x /u 2) to i13) to i32))<nsw> U: [0,-1) S: [-16382,1)
+; CHECK-NEXT:    %res = and i32 %neg, 268431360
+; CHECK-NEXT:    --> (4096 * (zext i16 (trunc i32 ((-1 * (zext i13 (trunc i32 (%x /u 2) to i13) to i32))<nsw> /u 2048) to i16) to i32))<nuw><nsw> U: [0,268431361) S: [0,268431361)
+; CHECK-NEXT:  Determining loop execution counts for: @pr135511
+;
+  %and = and i32 %x, 16382
+  %neg = sub nsw i32 0, %and
+  %res = and i32 %neg, 268431360
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
new file mode 100644
index 000000000000..503ead4eba2d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,ldp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-LDPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,stp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-STPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK-OFF
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 < %s | FileCheck %s --check-prefixes=CHECK-OFF
+
+define void @nxv16i8(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+;
+; CHECK-BE-LABEL: nxv16i8:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ptrue p0.b
+; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, #1, mul vl]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-LDPALIGNEDONLY:       // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #1, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-STPALIGNEDONLY:       // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #1, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-OFF-LABEL: nxv16i8:
+; CHECK-OFF:       // %bb.0:
+; CHECK-OFF-NEXT:    ldr z0, [x0]
+; CHECK-OFF-NEXT:    ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT:    str z0, [x1]
+; CHECK-OFF-NEXT:    str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT:    ret
+  %vscale = tail call i64 @llvm.vscale()
+  %vl = shl nuw nsw i64 %vscale, 4
+  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+  %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+  store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+  ret void
+}
+
+define void @nxv16i8_max_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_max_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #-1024]
+; CHECK-NEXT:    stp q0, q1, [x1, #1008]
+; CHECK-NEXT:    ret
+;
+; CHECK-BE-LABEL: nxv16i8_max_range:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    rdvl x8, #1
+; CHECK-BE-NEXT:    mov x9, #-1008 // =0xfffffffffffffc10
+; CHECK-BE-NEXT:    mov x10, #-1024 // =0xfffffffffffffc00
+; CHECK-BE-NEXT:    lsr x8, x8, #4
+; CHECK-BE-NEXT:    mov w11, #1008 // =0x3f0
+; CHECK-BE-NEXT:    mov w12, #1024 // =0x400
+; CHECK-BE-NEXT:    ptrue p0.b
+; CHECK-BE-NEXT:    mul x9, x8, x9
+; CHECK-BE-NEXT:    mul x10, x8, x10
+; CHECK-BE-NEXT:    mul x11, x8, x11
+; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
+; CHECK-BE-NEXT:    mul x8, x8, x12
+; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0, x10]
+; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1, x11]
+; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, x8]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-LDPALIGNEDONLY:       // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #-63, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1, #1008]
+; CHECK-LDPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-STPALIGNEDONLY:       // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0, #-1024]
+; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1, #63, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-OFF-LABEL: nxv16i8_max_range:
+; CHECK-OFF:       // %bb.0:
+; CHECK-OFF-NEXT:    ldr z0, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT:    ldr z1, [x0, #-63, mul vl]
+; CHECK-OFF-NEXT:    str z0, [x1, #63, mul vl]
+; CHECK-OFF-NEXT:    str z1, [x1, #64, mul vl]
+; CHECK-OFF-NEXT:    ret
+  %vscale = tail call i64 @llvm.vscale()
+  %ldoff1 = mul i64 %vscale, -1024
+  %ldoff2 = mul i64 %vscale, -1008
+  %stoff1 = mul i64 %vscale, 1008
+  %stoff2 = mul i64 %vscale, 1024
+  %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+  %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+  %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+  store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+  ret void
+}
+
+define void @nxv16i8_outside_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_outside_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0, #-65, mul vl]
+; CHECK-NEXT:    ldr z1, [x0, #-64, mul vl]
+; CHECK-NEXT:    str z0, [x1, #64, mul vl]
+; CHECK-NEXT:    str z1, [x1, #65, mul vl]
+; CHECK-NEXT:    ret
+;
+; CHECK-BE-LABEL: nxv16i8_outside_range:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    rdvl x8, #1
+; CHECK-BE-NEXT:    mov x9, #-1040 // =0xfffffffffffffbf0
+; CHECK-BE-NEXT:    mov x10, #-1024 // =0xfffffffffffffc00
+; CHECK-BE-NEXT:    lsr x8, x8, #4
+; CHECK-BE-NEXT:    mov w11, #1024 // =0x400
+; CHECK-BE-NEXT:    mov w12, #1040 // =0x410
+; CHECK-BE-NEXT:    ptrue p0.b
+; CHECK-BE-NEXT:    mul x9, x8, x9
+; CHECK-BE-NEXT:    mul x10, x8, x10
+; CHECK-BE-NEXT:    mul x11, x8, x11
+; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0, x9]
+; CHECK-BE-NEXT:    mul x8, x8, x12
+; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, x10]
+; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1, x11]
+; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, x8]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-LDPALIGNEDONLY:       // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0, #-65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    str z0, [x1, #64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    str z1, [x1, #65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-STPALIGNEDONLY:       // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT:    ldr z0, [x0, #-65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    ldr z1, [x0, #-64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-OFF-LABEL: nxv16i8_outside_range:
+; CHECK-OFF:       // %bb.0:
+; CHECK-OFF-NEXT:    ldr z0, [x0, #-65, mul vl]
+; CHECK-OFF-NEXT:    ldr z1, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT:    str z0, [x1, #64, mul vl]
+; CHECK-OFF-NEXT:    str z1, [x1, #65, mul vl]
+; CHECK-OFF-NEXT:    ret
+  %vscale = tail call i64 @llvm.vscale()
+  %ldoff1 = mul i64 %vscale, -1040
+  %ldoff2 = mul i64 %vscale, -1024
+  %stoff1 = mul i64 %vscale, 1024
+  %stoff2 = mul i64 %vscale, 1040
+  %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+  %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+  %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+  store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+  ret void
+}
+
+define void @nxv16i8_2vl_stride(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_2vl_stride:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT:    str z0, [x1]
+; CHECK-NEXT:    str z1, [x1, #2, mul vl]
+; CHECK-NEXT:    ret
+;
+; CHECK-BE-LABEL: nxv16i8_2vl_stride:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ptrue p0.b
+; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, #2, mul vl]
+; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, #2, mul vl]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-LDPALIGNEDONLY:       // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    str z0, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT:    str z1, [x1, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-STPALIGNEDONLY:       // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT:    ldr z0, [x0]
+; CHECK-STPALIGNEDONLY-NEXT:    ldr z1, [x0, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-OFF-LABEL: nxv16i8_2vl_stride:
+; CHECK-OFF:       // %bb.0:
+; CHECK-OFF-NEXT:    ldr z0, [x0]
+; CHECK-OFF-NEXT:    ldr z1, [x0, #2, mul vl]
+; CHECK-OFF-NEXT:    str z0, [x1]
+; CHECK-OFF-NEXT:    str z1, [x1, #2, mul vl]
+; CHECK-OFF-NEXT:    ret
+  %vscale = tail call i64 @llvm.vscale()
+  %vl = shl nuw nsw i64 %vscale, 5
+  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+  %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+  store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+  ret void
+}
+
+define void @nxv2f64_32b_aligned(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv2f64_32b_aligned:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+;
+; CHECK-BE-LABEL: nxv2f64_32b_aligned:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ptrue p0.d
+; CHECK-BE-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-BE-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-BE-NEXT:    st1d { z1.d }, p0, [x1, #1, mul vl]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-LDPALIGNEDONLY:       // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-STPALIGNEDONLY:       // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
+; CHECK-STPALIGNEDONLY-NEXT:    ret
+;
+; CHECK-OFF-LABEL: nxv2f64_32b_aligned:
+; CHECK-OFF:       // %bb.0:
+; CHECK-OFF-NEXT:    ldr z0, [x0]
+; CHECK-OFF-NEXT:    ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT:    str z0, [x1]
+; CHECK-OFF-NEXT:    str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT:    ret
+  %vscale = tail call i64 @llvm.vscale()
+  %vl = shl nuw nsw i64 %vscale, 4
+  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+  %ld1 = load <vscale x 2 x double>, ptr %ldptr, align 32
+  %ld2 = load <vscale x 2 x double>, ptr %ldptr2, align 32
+  store <vscale x 2 x double> %ld1, ptr %stptr, align 32
+  store <vscale x 2 x double> %ld2, ptr %stptr2, align 32
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/branch-relax-block-size.mir b/llvm/test/CodeGen/AArch64/branch-relax-block-size.mir
index 2684a550d921..6f07d1b2a6ea 100644
--- a/llvm/test/CodeGen/AArch64/branch-relax-block-size.mir
+++ b/llvm/test/CodeGen/AArch64/branch-relax-block-size.mir
@@ -1,5 +1,6 @@
 # REQUIRES: asserts
 # RUN: llc -mtriple=aarch64--linux-gnu -run-pass=branch-relaxation -debug-only=branch-relaxation %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=aarch64--linux-gnu -passes=branch-relaxation -debug-only=branch-relaxation %s -o /dev/null 2>&1 | FileCheck %s
 
 # Ensure meta instructions (e.g. CFI_INSTRUCTION) don't contribute to the code
 # size of a basic block.
diff --git a/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir b/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir
index db88bf0044a5..000246ad8299 100644
--- a/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir
+++ b/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir
@@ -1,6 +1,8 @@
 # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass branch-relaxation -aarch64-b-offset-bits=64 -aarch64-tbz-offset-bits=9 -aarch64-cbz-offset-bits=9 %s -o - | FileCheck %s
 # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass branch-relaxation -aarch64-tbz-offset-bits=9 -aarch64-cbz-offset-bits=9 %s -o - | FileCheck --check-prefix=INDIRECT %s
 
+# RUN: llc -mtriple=aarch64-none-linux-gnu -passes branch-relaxation -aarch64-tbz-offset-bits=9 -aarch64-cbz-offset-bits=9 %s -o - | FileCheck --check-prefix=INDIRECT %s
+
 --- |
   declare i32 @bar()
   declare i32 @baz()
diff --git a/llvm/test/CodeGen/AArch64/peephole-orr.mir b/llvm/test/CodeGen/AArch64/peephole-orr.mir
index 3431676438bd..f718328ecf2d 100644
--- a/llvm/test/CodeGen/AArch64/peephole-orr.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-orr.mir
@@ -1,6 +1,49 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s
-
+---
+name: copy_fpr128_gpr32
+body: |
+  bb.0:
+    liveins: $q0
+    ; CHECK-LABEL: name: copy_fpr128_gpr32
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY [[COPY]].ssub
+    ; CHECK-NEXT: [[FMOVSWr:%[0-9]+]]:gpr32 = FMOVSWr [[COPY1]]
+    %0:fpr128 = COPY $q0
+    %1:gpr32 = COPY %0.ssub:fpr128
+    %2:gpr32 = ORRWrs $wzr, killed %1:gpr32, 0
+...
+---
+name: copy_fpr32_gpr32
+body: |
+  bb.0:
+    liveins: $s0
+    ; CHECK-LABEL: name: copy_fpr32_gpr32
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
+    ; CHECK-NEXT: [[FMOVSWr:%[0-9]+]]:gpr32 = FMOVSWr [[COPY]]
+    %0:fpr32 = COPY $s0
+    %1:gpr32 = COPY %0:fpr32
+    %2:gpr32 = ORRWrs $wzr, killed %1:gpr32, 0
+...
+---
+name: copy_zpr_gpr32
+body: |
+  bb.0:
+    liveins: $z0
+    ; CHECK-LABEL: name: copy_zpr_gpr32
+    ; CHECK: liveins: $z0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY [[COPY]].ssub
+    ; CHECK-NEXT: [[FMOVSWr:%[0-9]+]]:gpr32 = FMOVSWr [[COPY1]]
+    %0:zpr = COPY $z0
+    %1:gpr32 = COPY %0.ssub:zpr
+    %2:gpr32 = ORRWrs $wzr, killed %1:gpr32, 0
+...
 ---
 name: copy_multiple_uses
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir b/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir
new file mode 100644
index 000000000000..49453bc17891
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir
@@ -0,0 +1,74 @@
+# RUN: llc -mtriple=aarch64-unknown-linux -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: pair-sve-fill-spill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+    renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+    renamable $z1 = LDR_ZXI killed renamable $x0, 1 :: (load (<vscale x 1 x s128>))
+    STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+    STR_ZXI killed renamable $z1, killed renamable $x1, 1 :: (store (<vscale x 1 x s128>))
+    RET_ReallyLR
+...
+# CHECK-LABEL: name: pair-sve-fill-spill
+# CHECK: $q0, $q1 = LDPQi renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+# CHECK: STPQi killed $q0, killed $q1, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+---
+name: do-not-pair-sve-with-neon-scaled
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+    ; SVE LDR + Neon LDR
+    renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+    renamable $q1 = LDRQui renamable $x0, 1 :: (load (s128))
+    ; Neon LDR + SVE LDR
+    renamable $q2 = LDRQui renamable $x0, 3 :: (load (s128))
+    renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+    ; SVE STR + Neon STR
+    STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+    STRQui killed renamable $q1, renamable $x1, 1 :: (store (s128))
+    ; Neon STR + SVE STR
+    STRQui killed renamable $q2, renamable $x1, 3 :: (store (s128))
+    STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
+    RET_ReallyLR
+...
+# CHECK-LABEL: name: do-not-pair-sve-with-neon-scaled
+# CHECK: renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+# CHECK: renamable $q1 = LDRQui renamable $x0, 1 :: (load (s128))
+# CHECK: renamable $q2 = LDRQui renamable $x0, 3 :: (load (s128))
+# CHECK: renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+# CHECK: STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+# CHECK: STRQui killed renamable $q1, renamable $x1, 1 :: (store (s128))
+# CHECK: STRQui killed renamable $q2, renamable $x1, 3 :: (store (s128))
+# CHECK: STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
+---
+name: do-not-pair-sve-with-neon-unscaled
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x0, $x1
+    ; SVE LDR + Neon LDUR
+    renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+    renamable $q1 = LDURQi renamable $x0, 16 :: (load (s128))
+    ; Neon LDUR + SVE LDR
+    renamable $q2 = LDURQi renamable $x0, 48 :: (load (s128))
+    renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+    ; SVE STR + Neon STUR
+    STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+    STURQi killed renamable $q1, renamable $x1, 16 :: (store (s128))
+    ; Neon STUR + SVE STR
+    STURQi killed renamable $q2, renamable $x1, 48 :: (store (s128))
+    STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
+    RET_ReallyLR
+...
+# CHECK-LABEL: name: do-not-pair-sve-with-neon-unscaled
+# CHECK: renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+# CHECK: renamable $q1 = LDURQi renamable $x0, 16 :: (load (s128))
+# CHECK: renamable $q2 = LDURQi renamable $x0, 48 :: (load (s128))
+# CHECK: renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+# CHECK: STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+# CHECK: STURQi killed renamable $q1, renamable $x1, 16 :: (store (s128))
+# CHECK: STURQi killed renamable $q2, renamable $x1, 48 :: (store (s128))
+# CHECK: STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
index e4d9fbfb1705..474ba71b0eba 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --amdgpu-s-branch-bits=5 -run-pass branch-relaxation %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --amdgpu-s-branch-bits=5 -passes=branch-relaxation %s -o - | FileCheck %s
 
 ---
 name:            branch_no_terminators
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
new file mode 100644
index 000000000000..4095347d7862
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that types where the store/allocation sizes don't match the type size
+; don't crash.
+
+
+define <7 x i9> @load_elem_i9_access_7xi9() {
+; CHECK-LABEL: @load_elem_i9_access_7xi9(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i9>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <7 x i9>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <7 x i9> [[L]]
+;
+  %p = alloca <16 x i9>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <7 x i9>, ptr addrspace(5) %g, align 1
+  ret <7 x i9> %l
+}
+
+define <8 x i1> @load_elem_i1_access_8xi1() {
+; CHECK-LABEL: @load_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @load_elem_i1_access_3xi1() {
+; CHECK-LABEL: @load_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @load_elem_i8_access_3xi1() {
+; CHECK-LABEL: @load_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @load_elem_i8_access_8xi1() {
+; CHECK-LABEL: @load_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @storeload_elem_i1_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i1_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i8_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @storeload_elem_i8_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @array_of_vec_elem_i1_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i1_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i8_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <8 x i8>], align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> <i1 true, i1 false, i1 true>, ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @array_of_vec_elem_i8_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <16 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[P]], i8 1, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 2, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 3, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 4, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 5, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 6, i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 7, i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 8, i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 5, i32 4
+; CHECK-NEXT:    ret <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 9189cef019cf..4a5dc8f300af 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -16,15 +16,16 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0x100
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s1, s2
 ; CHECK-NEXT:    s_mov_b32 s2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
+; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -50,7 +51,7 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x100
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; CHECK-NEXT:    s_endpgm
@@ -61,7 +62,7 @@ entry:
 
 for.cond4.preheader:                              ; preds = %for.cond4.preheader, %entry
   %idx.07 = phi i32 [ %add13, %for.cond4.preheader ], [ 0, %entry ]
-  %arrayidx.promoted = load double, ptr addrspace(1) null, align 8
+  %arrayidx.promoted = load double, ptr addrspace(1) inttoptr (i64 256 to ptr addrspace(1)), align 8
   %add9 = fadd contract double %arrayidx.promoted, 0.000000e+00
   %add9.1 = fadd contract double %add9, 5.000000e+00
   %add9.2 = fadd contract double %add9.1, 6.000000e+00
@@ -70,7 +71,7 @@ for.cond4.preheader:                              ; preds = %for.cond4.preheader
   %add9.5 = fadd contract double %add9.4, 1.000000e+01
   %add9.6 = fadd contract double %add9.5, 1.100000e+01
   %add9.7 = fadd contract double %add9.6, 1.200000e+01
-  store double %add9.7, ptr addrspace(1) null, align 8
+  store double %add9.7, ptr addrspace(1) inttoptr (i64 256 to ptr addrspace(1)), align 8
   %add13 = add i32 %idx.07, %0
   %cmp = icmp slt i32 %add13, 2560
   br i1 %cmp, label %for.cond4.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/PowerPC/pr43527.ll b/llvm/test/CodeGen/PowerPC/pr43527.ll
index 379bd6c070c7..adfea51077a0 100644
--- a/llvm/test/CodeGen/PowerPC/pr43527.ll
+++ b/llvm/test/CodeGen/PowerPC/pr43527.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \
 ; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
 ; We don't want to produce a CTR loop due to the call to lrint in the body.
-define dso_local void @test(i64 %arg, i64 %arg1) {
+define dso_local void @test(i64 %arg, i64 %arg1, ptr %arg2) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_5
@@ -12,29 +12,33 @@ define dso_local void @test(i64 %arg, i64 %arg1) {
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    .cfi_offset r28, -32
 ; CHECK-NEXT:    .cfi_offset r29, -24
 ; CHECK-NEXT:    .cfi_offset r30, -16
+; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    sub r30, r4, r3
-; CHECK-NEXT:    li r29, -4
+; CHECK-NEXT:    mr r30, r5
+; CHECK-NEXT:    sub r29, r4, r3
+; CHECK-NEXT:    addi r28, r5, -4
 ; CHECK-NEXT:    std r0, 80(r1)
 ; CHECK-NEXT:    .p2align 5
 ; CHECK-NEXT:  .LBB0_3: # %bb5
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lfsu f1, 4(r29)
+; CHECK-NEXT:    lfsu f1, 4(r28)
 ; CHECK-NEXT:    bl lrint
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi r30, r30, -1
-; CHECK-NEXT:    cmpldi r30, 0
+; CHECK-NEXT:    addi r29, r29, -1
+; CHECK-NEXT:    stb r3, 0(r30)
+; CHECK-NEXT:    cmpldi r29, 0
 ; CHECK-NEXT:    bc 12, gt, .LBB0_3
 ; CHECK-NEXT:  # %bb.4: # %bb15
-; CHECK-NEXT:    stb r3, 0(r3)
 ; CHECK-NEXT:    addi r1, r1, 64
 ; CHECK-NEXT:    ld r0, 16(r1)
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB0_5: # %bb2
@@ -54,12 +58,12 @@ bb4:                                              ; preds = %bb3
 
 bb5:                                              ; preds = %bb5, %bb4
   %tmp6 = phi i64 [ %tmp12, %bb5 ], [ 0, %bb4 ]
-  %tmp7 = getelementptr inbounds float, ptr null, i64 %tmp6
+  %tmp7 = getelementptr inbounds float, ptr %arg2, i64 %tmp6
   %tmp8 = load float, ptr %tmp7, align 4
   %tmp9 = fpext float %tmp8 to double
   %tmp10 = tail call i64 @llvm.lrint.i64.f64(double %tmp9) #2
   %tmp11 = trunc i64 %tmp10 to i8
-  store i8 %tmp11, ptr undef, align 1
+  store i8 %tmp11, ptr %arg2, align 1
   %tmp12 = add nuw i64 %tmp6, 1
   %tmp13 = icmp eq i64 %tmp12, %tmp
   br i1 %tmp13, label %bb15, label %bb5
diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll
index fa156454a131..b610f12159ee 100644
--- a/llvm/test/CodeGen/PowerPC/pr48519.ll
+++ b/llvm/test/CodeGen/PowerPC/pr48519.ll
@@ -32,7 +32,7 @@ define void @julia__typed_vcat_20() #0 {
 ; CHECK-NEXT:  # %bb.2: # %bb11
 ; CHECK-NEXT:    bl __truncsfhf2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    sth r3, 0(r3)
+; CHECK-NEXT:    sth r3, 128(0)
 ;
 ; CHECK-P9-LABEL: julia__typed_vcat_20:
 ; CHECK-P9:       # %bb.0: # %bb
@@ -54,6 +54,7 @@ define void @julia__typed_vcat_20() #0 {
 ; CHECK-P9-NEXT:    bdnz .LBB0_1
 ; CHECK-P9-NEXT:  # %bb.2: # %bb11
 ; CHECK-P9-NEXT:    xscvdphp f0, f0
+; CHECK-P9-NEXT:    li r3, 128
 ; CHECK-P9-NEXT:    stxsihx f0, 0, r3
 bb:
   %i = load i64, ptr addrspace(11) null, align 8
@@ -67,7 +68,7 @@ bb3:                                              ; preds = %bb3, %bb
   %i6 = add nsw i64 %i5, -1
   %i7 = add i64 %i6, 0
   %i8 = sitofp i64 %i7 to half
-  store half %i8, ptr addrspace(13) undef, align 2
+  store half %i8, ptr addrspace(13) inttoptr (i64 128 to ptr addrspace(13)), align 2
   %i9 = icmp eq i64 %i4, 0
   %i10 = add i64 %i4, 1
   br i1 %i9, label %bb11, label %bb3
diff --git a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll
index f72598cb4cbc..eaea47608eb2 100644
--- a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll
@@ -2,32 +2,32 @@
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
 ; RUN:       -mcpu=pwr9 --ppc-enable-pipeliner | FileCheck %s
 
-define void @lame_encode_buffer_interleaved() local_unnamed_addr {
+define void @lame_encode_buffer_interleaved(ptr %arg0) local_unnamed_addr {
 ; CHECK-LABEL: lame_encode_buffer_interleaved:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lha 3, 0(3)
-; CHECK-NEXT:    li 5, 1
-; CHECK-NEXT:    lhz 4, 0(0)
-; CHECK-NEXT:    rldic 5, 5, 62, 1
-; CHECK-NEXT:    mtctr 5
-; CHECK-NEXT:    srawi 3, 3, 1
-; CHECK-NEXT:    addze 3, 3
+; CHECK-NEXT:    li 4, 1
+; CHECK-NEXT:    rldic 4, 4, 62, 1
+; CHECK-NEXT:    mtctr 4
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    extsh 4, 4
+; CHECK-NEXT:    lha 4, 0(3)
+; CHECK-NEXT:    lha 5, 0(3)
 ; CHECK-NEXT:    srawi 4, 4, 1
 ; CHECK-NEXT:    addze 4, 4
+; CHECK-NEXT:    srawi 5, 5, 1
+; CHECK-NEXT:    addze 5, 5
+; CHECK-NEXT:    sth 4, 0(3)
+; CHECK-NEXT:    sth 5, 0(3)
 ; CHECK-NEXT:    bdnz .LBB0_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    sth 4, 0(0)
-; CHECK-NEXT:    sth 3, 0(3)
 ; CHECK-NEXT:    blr
+  %undef = freeze ptr poison
   br label %1
 
 1:                                                ; preds = %1, %0
   %2 = phi i64 [ 0, %0 ], [ %13, %1 ]
-  %3 = load i16, ptr null, align 2
-  %4 = load i16, ptr undef, align 2
+  %3 = load i16, ptr %arg0, align 2
+  %4 = load i16, ptr %undef, align 2
   %5 = sext i16 %3 to i32
   %6 = sext i16 %4 to i32
   %7 = add nsw i32 0, %5
@@ -36,8 +36,8 @@ define void @lame_encode_buffer_interleaved() local_unnamed_addr {
   %10 = sdiv i32 %8, 2
   %11 = trunc i32 %9 to i16
   %12 = trunc i32 %10 to i16
-  store i16 %11, ptr null, align 2
-  store i16 %12, ptr undef, align 2
+  store i16 %11, ptr %arg0, align 2
+  store i16 %12, ptr %undef, align 2
   %13 = add i64 %2, 4
   %14 = icmp eq i64 %13, 0
   br i1 %14, label %15, label %1
diff --git a/llvm/test/CodeGen/RISCV/emit-x8-as-fp.ll b/llvm/test/CodeGen/RISCV/emit-x8-as-fp.ll
new file mode 100644
index 000000000000..0c9ef2176fd7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/emit-x8-as-fp.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32I-DEFAULT %s
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I-DEFAULT %s
+; RUN: llc -mtriple=riscv32 -M emit-x8-as-fp -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32I-EMIT-FP %s
+; RUN: llc -mtriple=riscv64 -M emit-x8-as-fp -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64I-EMIT-FP %s
+; RUN: llc -mtriple=riscv32 -M numeric -M emit-x8-as-fp -verify-machineinstrs \
+; RUN:   < %s | FileCheck -check-prefix=RV32I-NUMERIC %s
+; RUN: llc -mtriple=riscv64 -M numeric -M emit-x8-as-fp -verify-machineinstrs \
+; RUN:   < %s | FileCheck -check-prefix=RV64I-NUMERIC %s
+
+define signext i32 @add(i32 %0, i32 %1) #0 {
+; RV32I-DEFAULT-LABEL: add:
+; RV32I-DEFAULT:       # %bb.0:
+; RV32I-DEFAULT-NEXT:    addi sp, sp, -16
+; RV32I-DEFAULT-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-DEFAULT-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-DEFAULT-NEXT:    addi s0, sp, 16
+; RV32I-DEFAULT-NEXT:    sw a0, -12(s0)
+; RV32I-DEFAULT-NEXT:    sw a1, -16(s0)
+; RV32I-DEFAULT-NEXT:    lw a0, -12(s0)
+; RV32I-DEFAULT-NEXT:    lw a1, -16(s0)
+; RV32I-DEFAULT-NEXT:    add a0, a0, a1
+; RV32I-DEFAULT-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-DEFAULT-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-DEFAULT-NEXT:    addi sp, sp, 16
+; RV32I-DEFAULT-NEXT:    ret
+;
+; RV64I-DEFAULT-LABEL: add:
+; RV64I-DEFAULT:       # %bb.0:
+; RV64I-DEFAULT-NEXT:    addi sp, sp, -32
+; RV64I-DEFAULT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-DEFAULT-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-DEFAULT-NEXT:    addi s0, sp, 32
+; RV64I-DEFAULT-NEXT:    sw a0, -20(s0)
+; RV64I-DEFAULT-NEXT:    sw a1, -24(s0)
+; RV64I-DEFAULT-NEXT:    lw a0, -20(s0)
+; RV64I-DEFAULT-NEXT:    lw a1, -24(s0)
+; RV64I-DEFAULT-NEXT:    addw a0, a0, a1
+; RV64I-DEFAULT-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-DEFAULT-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-DEFAULT-NEXT:    addi sp, sp, 32
+; RV64I-DEFAULT-NEXT:    ret
+;
+; RV32I-EMIT-FP-LABEL: add:
+; RV32I-EMIT-FP:       # %bb.0:
+; RV32I-EMIT-FP-NEXT:    addi sp, sp, -16
+; RV32I-EMIT-FP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-EMIT-FP-NEXT:    sw fp, 8(sp) # 4-byte Folded Spill
+; RV32I-EMIT-FP-NEXT:    addi fp, sp, 16
+; RV32I-EMIT-FP-NEXT:    sw a0, -12(fp)
+; RV32I-EMIT-FP-NEXT:    sw a1, -16(fp)
+; RV32I-EMIT-FP-NEXT:    lw a0, -12(fp)
+; RV32I-EMIT-FP-NEXT:    lw a1, -16(fp)
+; RV32I-EMIT-FP-NEXT:    add a0, a0, a1
+; RV32I-EMIT-FP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-EMIT-FP-NEXT:    lw fp, 8(sp) # 4-byte Folded Reload
+; RV32I-EMIT-FP-NEXT:    addi sp, sp, 16
+; RV32I-EMIT-FP-NEXT:    ret
+;
+; RV64I-EMIT-FP-LABEL: add:
+; RV64I-EMIT-FP:       # %bb.0:
+; RV64I-EMIT-FP-NEXT:    addi sp, sp, -32
+; RV64I-EMIT-FP-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-EMIT-FP-NEXT:    sd fp, 16(sp) # 8-byte Folded Spill
+; RV64I-EMIT-FP-NEXT:    addi fp, sp, 32
+; RV64I-EMIT-FP-NEXT:    sw a0, -20(fp)
+; RV64I-EMIT-FP-NEXT:    sw a1, -24(fp)
+; RV64I-EMIT-FP-NEXT:    lw a0, -20(fp)
+; RV64I-EMIT-FP-NEXT:    lw a1, -24(fp)
+; RV64I-EMIT-FP-NEXT:    addw a0, a0, a1
+; RV64I-EMIT-FP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-EMIT-FP-NEXT:    ld fp, 16(sp) # 8-byte Folded Reload
+; RV64I-EMIT-FP-NEXT:    addi sp, sp, 32
+; RV64I-EMIT-FP-NEXT:    ret
+;
+; RV32I-NUMERIC-LABEL: add:
+; RV32I-NUMERIC:       # %bb.0:
+; RV32I-NUMERIC-NEXT:    addi x2, x2, -16
+; RV32I-NUMERIC-NEXT:    sw x1, 12(x2) # 4-byte Folded Spill
+; RV32I-NUMERIC-NEXT:    sw x8, 8(x2) # 4-byte Folded Spill
+; RV32I-NUMERIC-NEXT:    addi x8, x2, 16
+; RV32I-NUMERIC-NEXT:    sw x10, -12(x8)
+; RV32I-NUMERIC-NEXT:    sw x11, -16(x8)
+; RV32I-NUMERIC-NEXT:    lw x10, -12(x8)
+; RV32I-NUMERIC-NEXT:    lw x11, -16(x8)
+; RV32I-NUMERIC-NEXT:    add x10, x10, x11
+; RV32I-NUMERIC-NEXT:    lw x1, 12(x2) # 4-byte Folded Reload
+; RV32I-NUMERIC-NEXT:    lw x8, 8(x2) # 4-byte Folded Reload
+; RV32I-NUMERIC-NEXT:    addi x2, x2, 16
+; RV32I-NUMERIC-NEXT:    ret
+;
+; RV64I-NUMERIC-LABEL: add:
+; RV64I-NUMERIC:       # %bb.0:
+; RV64I-NUMERIC-NEXT:    addi x2, x2, -32
+; RV64I-NUMERIC-NEXT:    sd x1, 24(x2) # 8-byte Folded Spill
+; RV64I-NUMERIC-NEXT:    sd x8, 16(x2) # 8-byte Folded Spill
+; RV64I-NUMERIC-NEXT:    addi x8, x2, 32
+; RV64I-NUMERIC-NEXT:    sw x10, -20(x8)
+; RV64I-NUMERIC-NEXT:    sw x11, -24(x8)
+; RV64I-NUMERIC-NEXT:    lw x10, -20(x8)
+; RV64I-NUMERIC-NEXT:    lw x11, -24(x8)
+; RV64I-NUMERIC-NEXT:    addw x10, x10, x11
+; RV64I-NUMERIC-NEXT:    ld x1, 24(x2) # 8-byte Folded Reload
+; RV64I-NUMERIC-NEXT:    ld x8, 16(x2) # 8-byte Folded Reload
+; RV64I-NUMERIC-NEXT:    addi x2, x2, 32
+; RV64I-NUMERIC-NEXT:    ret
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, ptr %3, align 4
+  store i32 %1, ptr %4, align 4
+  %5 = load i32, ptr %3, align 4
+  %6 = load i32, ptr %4, align 4
+  %7 = add nsw i32 %5, %6
+  ret i32 %7
+}
+
+attributes #0 = { noinline nounwind optnone "frame-pointer"="all" }
diff --git a/llvm/test/CodeGen/X86/fake-use-remove-loads.mir b/llvm/test/CodeGen/X86/fake-use-remove-loads.mir
index 3f67f03c9a63..aa9839d2700a 100644
--- a/llvm/test/CodeGen/X86/fake-use-remove-loads.mir
+++ b/llvm/test/CodeGen/X86/fake-use-remove-loads.mir
@@ -3,6 +3,8 @@
 # remove-loads-into-fake-uses pass, and that if the function does not use
 # instruction referencing then no changes are made.
 # RUN: llc %s -run-pass remove-loads-into-fake-uses -mtriple=x86_64-unknown-linux -debug-only=remove-loads-into-fake-uses 2>&1 -o - | FileCheck %s
+# RUN: llc %s -passes remove-loads-into-fake-uses -mtriple=x86_64-unknown-linux -debug-only=remove-loads-into-fake-uses 2>&1 -o - | FileCheck %s
+
 # REQUIRES: asserts
 #
 ## We verify that:
diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll
new file mode 100644
index 000000000000..e4376cbeab10
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr134602.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+; FIXME: incorrect vector codegen due to bad handling of splats of binops containing undefs
+define i32 @PR134602(i16 %a0) {
+; X86-LABEL: PR134602:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl $1, %eax
+; X86-NEXT:    addl $3, %eax
+; X86-NEXT:    cwtl
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR134602:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %splat= insertelement <4 x i16> zeroinitializer, i16 %a0, i64 0
+  %mul = mul <4 x i16> %splat, <i16 1, i16 1, i16 0, i16 0>
+  %or = or <4 x i16> splat (i16 1), %mul
+  %reduce = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %or)
+  %ret_32 = sext i16 %reduce to i32
+  ret i32 %ret_32
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
index b289a4c9aa65..c6782a6c9643 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --sort --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 
@@ -56,14 +56,23 @@ v_add_f16 v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 v_add_f16 v5.l, v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: v_add_f16_e64_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
 
-v_fmac_f16 v255, v1, v2
-// GFX11: v_fmac_f16_e64 v255, v1, v2             ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
+v_fmac_f16 v255.h, v1.h, v2.h
+// GFX11: v_fmac_f16_e64 v255.h, v1.h, v2.h op_sel:[1,1,1,1] ; encoding: [0xff,0x58,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16 v5, v1, v255
-// GFX11: v_fmac_f16_e64 v5, v1, v255             ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00]
+v_fmac_f16 v255.l, v1.l, v2.l
+// GFX11: v_fmac_f16_e64 v255.l, v1.l, v2.l       ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16 v5, v255, v2
-// GFX11: v_fmac_f16_e64 v5, v255, v2             ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00]
+v_fmac_f16 v5.h, v1.h, v255.h
+// GFX11: v_fmac_f16_e64 v5.h, v1.h, v255.h op_sel:[1,1,1,1] ; encoding: [0x05,0x58,0x36,0xd5,0x01,0xff,0x03,0x00]
+
+v_fmac_f16 v5.h, v255.h, v2.h
+// GFX11: v_fmac_f16_e64 v5.h, v255.h, v2.h op_sel:[1,1,1,1] ; encoding: [0x05,0x58,0x36,0xd5,0xff,0x05,0x02,0x00]
+
+v_fmac_f16 v5.l, v1.l, v255.l
+// GFX11: v_fmac_f16_e64 v5.l, v1.l, v255.l       ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00]
+
+v_fmac_f16 v5.l, v255.l, v2.l
+// GFX11: v_fmac_f16_e64 v5.l, v255.l, v2.l       ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00]
 
 v_ldexp_f16 v255.h, v1.h, v2.h
 // GFX11: v_ldexp_f16_e64 v255.h, v1.h, v2.h op_sel:[1,1,1] ; encoding: [0xff,0x58,0x3b,0xd5,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s
index 15d6547a0477..cb05bc403778 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop2.s
@@ -524,47 +524,59 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3
 v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x2f,0xd5,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
 
-v_fmac_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_mirror
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_half_mirror
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_shl:1
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_shl:15
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_shr:1
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_shr:15
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_ror:1
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_ror:15
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_fmac_f16_e64_dpp v5, |v1|, -v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fmac_f16_e64_dpp v5, |v1|, -v2 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x36,0xd5,0xfa,0x04,0x02,0x48,0x01,0x5f,0x01,0x01]
+v_fmac_f16_e64_dpp v5.l, |v1.l|, -v2.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fmac_f16_e64_dpp v5.l, |v1.l|, -v2.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x36,0xd5,0xfa,0x04,0x02,0x48,0x01,0x5f,0x01,0x01]
 
-v_fmac_f16_e64_dpp v5, -v1, |v2| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fmac_f16_e64_dpp v5, -v1, |v2| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x36,0xd5,0xfa,0x04,0x02,0x30,0x01,0x60,0x09,0x13]
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.l| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.l| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x36,0xd5,0xfa,0x04,0x02,0x30,0x01,0x60,0x09,0x13]
 
-v_fmac_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fmac_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x36,0xd5,0xfa,0xfe,0x03,0x78,0xff,0x6f,0x05,0x30]
+v_fmac_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fmac_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x36,0xd5,0xfa,0xfe,0x03,0x78,0xff,0x6f,0x05,0x30]
+
+v_fmac_f16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fmac_f16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x36,0xd5,0xfa,0x04,0x02,0x48,0x01,0x5f,0x01,0x01]
+
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x36,0xd5,0xfa,0x04,0x02,0x30,0x01,0x60,0x09,0x13]
+
+v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| op_sel:[0,0,1,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc3,0x36,0xd5,0xfa,0xfe,0x03,0x78,0xff,0x6f,0x05,0x30]
 
 v_fmac_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_fmac_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s
index 2f52b7f467e0..35bbca0b6d0b 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop2.s
@@ -169,17 +169,29 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x2f,0xd5,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
-v_fmac_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fmac_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x36,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x36,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
-v_fmac_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fmac_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x36,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
+v_fmac_f16_e64_dpp v5.l, |v1.l|, -v2.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fmac_f16_e64_dpp v5.l, |v1.l|, -v2.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x36,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
 
-v_fmac_f16_e64_dpp v5, -v1, |v2| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fmac_f16_e64_dpp v5, -v1, |v2| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x36,0xd5,0xea,0x04,0x02,0x30,0x01,0x77,0x39,0x05]
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.l| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.l| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x36,0xd5,0xea,0x04,0x02,0x30,0x01,0x77,0x39,0x05]
 
-v_fmac_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fmac_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x36,0xd5,0xe9,0xfe,0x03,0x78,0xff,0x00,0x00,0x00]
+v_fmac_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fmac_f16_e64_dpp v255.l, -|v255.l|, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x36,0xd5,0xe9,0xfe,0x03,0x78,0xff,0x00,0x00,0x00]
+
+v_fmac_f16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fmac_f16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x36,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x36,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x36,0xd5,0xea,0x04,0x02,0x30,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| op_sel:[0,0,1,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc3,0x36,0xd5,0xe9,0xfe,0x03,0x78,0xff,0x00,0x00,0x00]
 
 v_fmac_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fmac_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s
index 5debd064812c..4f410f611c8e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s
@@ -587,50 +587,59 @@ v_fmac_dx9_zero_f32_e64 v5, -src_scc, |vcc_lo| mul:4
 v_fmac_dx9_zero_f32_e64 v255, -|0xaf123456|, -|vcc_hi| clamp div:2
 // GFX11: v_fmac_dx9_zero_f32_e64 v255, -|0xaf123456|, -|vcc_hi| clamp div:2 ; encoding: [0xff,0x83,0x06,0xd5,0xff,0xd6,0x00,0x78,0x56,0x34,0x12,0xaf]
 
-v_fmac_f16_e64 v5, v1, v2
-// GFX11: v_fmac_f16_e64 v5, v1, v2               ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
+v_fmac_f16_e64 v5.l, v1.l, v2.l
+// GFX11: v_fmac_f16_e64 v5.l, v1.l, v2.l         ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16_e64 v5, v255, v255
-// GFX11: v_fmac_f16_e64 v5, v255, v255           ; encoding: [0x05,0x00,0x36,0xd5,0xff,0xff,0x03,0x00]
+v_fmac_f16_e64 v5.l, v255.l, v255.l
+// GFX11: v_fmac_f16_e64 v5.l, v255.l, v255.l     ; encoding: [0x05,0x00,0x36,0xd5,0xff,0xff,0x03,0x00]
 
-v_fmac_f16_e64 v5, s1, s2
-// GFX11: v_fmac_f16_e64 v5, s1, s2               ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x04,0x00,0x00]
+v_fmac_f16_e64 v5.l, s1, s2
+// GFX11: v_fmac_f16_e64 v5.l, s1, s2             ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x04,0x00,0x00]
 
-v_fmac_f16_e64 v5, s105, s105
-// GFX11: v_fmac_f16_e64 v5, s105, s105           ; encoding: [0x05,0x00,0x36,0xd5,0x69,0xd2,0x00,0x00]
+v_fmac_f16_e64 v5.l, s105, s105
+// GFX11: v_fmac_f16_e64 v5.l, s105, s105         ; encoding: [0x05,0x00,0x36,0xd5,0x69,0xd2,0x00,0x00]
 
-v_fmac_f16_e64 v5, vcc_lo, ttmp15
-// GFX11: v_fmac_f16_e64 v5, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x36,0xd5,0x6a,0xf6,0x00,0x00]
+v_fmac_f16_e64 v5.l, vcc_lo, ttmp15
+// GFX11: v_fmac_f16_e64 v5.l, vcc_lo, ttmp15     ; encoding: [0x05,0x00,0x36,0xd5,0x6a,0xf6,0x00,0x00]
 
-v_fmac_f16_e64 v5, vcc_hi, 0xfe0b
-// GFX11: v_fmac_f16_e64 v5, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x36,0xd5,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+v_fmac_f16_e64 v5.l, vcc_hi, 0xfe0b
+// GFX11: v_fmac_f16_e64 v5.l, vcc_hi, 0xfe0b     ; encoding: [0x05,0x00,0x36,0xd5,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_fmac_f16_e64 v5, ttmp15, src_scc
-// GFX11: v_fmac_f16_e64 v5, ttmp15, src_scc      ; encoding: [0x05,0x00,0x36,0xd5,0x7b,0xfa,0x01,0x00]
+v_fmac_f16_e64 v5.l, ttmp15, src_scc
+// GFX11: v_fmac_f16_e64 v5.l, ttmp15, src_scc    ; encoding: [0x05,0x00,0x36,0xd5,0x7b,0xfa,0x01,0x00]
 
-v_fmac_f16_e64 v5, m0, 0.5
-// GFX11: v_fmac_f16_e64 v5, m0, 0.5              ; encoding: [0x05,0x00,0x36,0xd5,0x7d,0xe0,0x01,0x00]
+v_fmac_f16_e64 v5.l, m0, 0.5
+// GFX11: v_fmac_f16_e64 v5.l, m0, 0.5            ; encoding: [0x05,0x00,0x36,0xd5,0x7d,0xe0,0x01,0x00]
 
-v_fmac_f16_e64 v5, exec_lo, -1
-// GFX11: v_fmac_f16_e64 v5, exec_lo, -1          ; encoding: [0x05,0x00,0x36,0xd5,0x7e,0x82,0x01,0x00]
+v_fmac_f16_e64 v5.l, exec_lo, -1
+// GFX11: v_fmac_f16_e64 v5.l, exec_lo, -1        ; encoding: [0x05,0x00,0x36,0xd5,0x7e,0x82,0x01,0x00]
 
-v_fmac_f16_e64 v5, |exec_hi|, null
-// GFX11: v_fmac_f16_e64 v5, |exec_hi|, null      ; encoding: [0x05,0x01,0x36,0xd5,0x7f,0xf8,0x00,0x00]
+v_fmac_f16_e64 v5.l, |exec_hi|, null
+// GFX11: v_fmac_f16_e64 v5.l, |exec_hi|, null    ; encoding: [0x05,0x01,0x36,0xd5,0x7f,0xf8,0x00,0x00]
 
-v_fmac_f16_e64 v5, null, exec_lo
-// GFX11: v_fmac_f16_e64 v5, null, exec_lo        ; encoding: [0x05,0x00,0x36,0xd5,0x7c,0xfc,0x00,0x00]
+v_fmac_f16_e64 v5.l, null, exec_lo
+// GFX11: v_fmac_f16_e64 v5.l, null, exec_lo      ; encoding: [0x05,0x00,0x36,0xd5,0x7c,0xfc,0x00,0x00]
 
-v_fmac_f16_e64 v5, -1, exec_hi
-// GFX11: v_fmac_f16_e64 v5, -1, exec_hi          ; encoding: [0x05,0x00,0x36,0xd5,0xc1,0xfe,0x00,0x00]
+v_fmac_f16_e64 v5.l, -1, exec_hi
+// GFX11: v_fmac_f16_e64 v5.l, -1, exec_hi        ; encoding: [0x05,0x00,0x36,0xd5,0xc1,0xfe,0x00,0x00]
 
-v_fmac_f16_e64 v5, 0.5, -m0 mul:2
-// GFX11: v_fmac_f16_e64 v5, 0.5, -m0 mul:2       ; encoding: [0x05,0x00,0x36,0xd5,0xf0,0xfa,0x00,0x48]
+v_fmac_f16_e64 v5.l, 0.5, -m0 mul:2
+// GFX11: v_fmac_f16_e64 v5.l, 0.5, -m0 mul:2     ; encoding: [0x05,0x00,0x36,0xd5,0xf0,0xfa,0x00,0x48]
 
-v_fmac_f16_e64 v5, -src_scc, |vcc_lo| mul:4
-// GFX11: v_fmac_f16_e64 v5, -src_scc, |vcc_lo| mul:4 ; encoding: [0x05,0x02,0x36,0xd5,0xfd,0xd4,0x00,0x30]
+v_fmac_f16_e64 v5.l, -src_scc, |vcc_lo| mul:4
+// GFX11: v_fmac_f16_e64 v5.l, -src_scc, |vcc_lo| mul:4 ; encoding: [0x05,0x02,0x36,0xd5,0xfd,0xd4,0x00,0x30]
 
-v_fmac_f16_e64 v255, -|0xfe0b|, -|vcc_hi| clamp div:2
-// GFX11: v_fmac_f16_e64 v255, -|0xfe0b|, -|vcc_hi| clamp div:2 ; encoding: [0xff,0x83,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
+v_fmac_f16_e64 v255.l, -|0xfe0b|, -|vcc_hi| clamp div:2
+// GFX11: v_fmac_f16_e64 v255.l, -|0xfe0b|, -|vcc_hi| clamp div:2 ; encoding: [0xff,0x83,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
+
+v_fmac_f16_e64 v5.l, v1.h, v2.l
+// GFX11: v_fmac_f16_e64 v5.l, v1.h, v2.l op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x36,0xd5,0x01,0x05,0x02,0x00]
+
+v_fmac_f16_e64 v5.l, v255.l, v255.h
+// GFX11: v_fmac_f16_e64 v5.l, v255.l, v255.h op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x36,0xd5,0xff,0xff,0x03,0x00]
+
+v_fmac_f16_e64 v255.h, -|0xfe0b|, -|vcc_hi| clamp div:2
+// GFX11: v_fmac_f16_e64 v255.h, -|0xfe0b|, -|vcc_hi| op_sel:[0,0,1,1] clamp div:2 ; encoding: [0xff,0xc3,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
 
 v_fmac_f32_e64 v5, v1, v2
 // GFX11: v_fmac_f32_e64 v5, v1, v2               ; encoding: [0x05,0x00,0x2b,0xd5,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s
index e1286dadfb6e..296c42f4c2c6 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s
@@ -55,14 +55,23 @@ v_add_f16 v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 v_add_f16 v5.l, v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: v_add_f16_e64_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
 
-v_fmac_f16 v255, v1, v2
-// GFX12: v_fmac_f16_e64 v255, v1, v2             ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
+v_fmac_f16 v255.h, v1.h, v2.h
+// GFX12: v_fmac_f16_e64 v255.h, v1.h, v2.h op_sel:[1,1,1,1] ; encoding: [0xff,0x58,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16 v5, v1, v255
-// GFX12: v_fmac_f16_e64 v5, v1, v255             ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00]
+v_fmac_f16 v255.l, v1.l, v2.l
+// GFX12: v_fmac_f16_e64 v255.l, v1.l, v2.l       ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16 v5, v255, v2
-// GFX12: v_fmac_f16_e64 v5, v255, v2             ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00]
+v_fmac_f16 v5.h, v1.h, v255.h
+// GFX12: v_fmac_f16_e64 v5.h, v1.h, v255.h op_sel:[1,1,1,1] ; encoding: [0x05,0x58,0x36,0xd5,0x01,0xff,0x03,0x00]
+
+v_fmac_f16 v5.h, v255.h, v2.h
+// GFX12: v_fmac_f16_e64 v5.h, v255.h, v2.h op_sel:[1,1,1,1] ; encoding: [0x05,0x58,0x36,0xd5,0xff,0x05,0x02,0x00]
+
+v_fmac_f16 v5.l, v1.l, v255.l
+// GFX12: v_fmac_f16_e64 v5.l, v1.l, v255.l       ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00]
+
+v_fmac_f16 v5.l, v255.l, v2.l
+// GFX12: v_fmac_f16_e64 v5.l, v255.l, v2.l       ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00]
 
 v_ldexp_f16 v255.h, v1.h, v2.h
 // GFX12: v_ldexp_f16_e64 v255.h, v1.h, v2.h op_sel:[1,1,1] ; encoding: [0xff,0x58,0x3b,0xd5,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s
index 7674eafd9c30..f16838919a4a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2.s
@@ -578,50 +578,59 @@ v_cvt_pkrtz_f16_f32_e64 v5, -src_scc, |vcc_lo|
 v_cvt_pkrtz_f16_f32_e64 v255, -|0xaf123456|, -|vcc_hi| clamp
 // GFX12: v_cvt_pk_rtz_f16_f32_e64 v255, -|0xaf123456|, -|vcc_hi| clamp ; encoding: [0xff,0x83,0x2f,0xd5,0xff,0xd6,0x00,0x60,0x56,0x34,0x12,0xaf]
 
-v_fmac_f16_e64 v5, v1, v2
-// GFX12: v_fmac_f16_e64 v5, v1, v2               ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
+v_fmac_f16_e64 v5.l, v1.l, v2.l
+// GFX12: v_fmac_f16_e64 v5.l, v1.l, v2.l         ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x05,0x02,0x00]
 
-v_fmac_f16_e64 v5, v255, v255
-// GFX12: v_fmac_f16_e64 v5, v255, v255           ; encoding: [0x05,0x00,0x36,0xd5,0xff,0xff,0x03,0x00]
+v_fmac_f16_e64 v5.l, v255.l, v255.l
+// GFX12: v_fmac_f16_e64 v5.l, v255.l, v255.l     ; encoding: [0x05,0x00,0x36,0xd5,0xff,0xff,0x03,0x00]
 
-v_fmac_f16_e64 v5, s1, s2
-// GFX12: v_fmac_f16_e64 v5, s1, s2               ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x04,0x00,0x00]
+v_fmac_f16_e64 v5.l, s1, s2
+// GFX12: v_fmac_f16_e64 v5.l, s1, s2             ; encoding: [0x05,0x00,0x36,0xd5,0x01,0x04,0x00,0x00]
 
-v_fmac_f16_e64 v5, s105, s105
-// GFX12: v_fmac_f16_e64 v5, s105, s105           ; encoding: [0x05,0x00,0x36,0xd5,0x69,0xd2,0x00,0x00]
+v_fmac_f16_e64 v5.l, s105, s105
+// GFX12: v_fmac_f16_e64 v5.l, s105, s105         ; encoding: [0x05,0x00,0x36,0xd5,0x69,0xd2,0x00,0x00]
 
-v_fmac_f16_e64 v5, vcc_lo, ttmp15
-// GFX12: v_fmac_f16_e64 v5, vcc_lo, ttmp15       ; encoding: [0x05,0x00,0x36,0xd5,0x6a,0xf6,0x00,0x00]
+v_fmac_f16_e64 v5.l, vcc_lo, ttmp15
+// GFX12: v_fmac_f16_e64 v5.l, vcc_lo, ttmp15     ; encoding: [0x05,0x00,0x36,0xd5,0x6a,0xf6,0x00,0x00]
 
-v_fmac_f16_e64 v5, vcc_hi, 0xfe0b
-// GFX12: v_fmac_f16_e64 v5, vcc_hi, 0xfe0b       ; encoding: [0x05,0x00,0x36,0xd5,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+v_fmac_f16_e64 v5.l, vcc_hi, 0xfe0b
+// GFX12: v_fmac_f16_e64 v5.l, vcc_hi, 0xfe0b     ; encoding: [0x05,0x00,0x36,0xd5,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_fmac_f16_e64 v5, ttmp15, src_scc
-// GFX12: v_fmac_f16_e64 v5, ttmp15, src_scc      ; encoding: [0x05,0x00,0x36,0xd5,0x7b,0xfa,0x01,0x00]
+v_fmac_f16_e64 v5.l, ttmp15, src_scc
+// GFX12: v_fmac_f16_e64 v5.l, ttmp15, src_scc    ; encoding: [0x05,0x00,0x36,0xd5,0x7b,0xfa,0x01,0x00]
 
-v_fmac_f16_e64 v5, m0, 0.5
-// GFX12: v_fmac_f16_e64 v5, m0, 0.5              ; encoding: [0x05,0x00,0x36,0xd5,0x7d,0xe0,0x01,0x00]
+v_fmac_f16_e64 v5.l, m0, 0.5
+// GFX12: v_fmac_f16_e64 v5.l, m0, 0.5            ; encoding: [0x05,0x00,0x36,0xd5,0x7d,0xe0,0x01,0x00]
 
-v_fmac_f16_e64 v5, exec_lo, -1
-// GFX12: v_fmac_f16_e64 v5, exec_lo, -1          ; encoding: [0x05,0x00,0x36,0xd5,0x7e,0x82,0x01,0x00]
+v_fmac_f16_e64 v5.l, exec_lo, -1
+// GFX12: v_fmac_f16_e64 v5.l, exec_lo, -1        ; encoding: [0x05,0x00,0x36,0xd5,0x7e,0x82,0x01,0x00]
 
-v_fmac_f16_e64 v5, |exec_hi|, null
-// GFX12: v_fmac_f16_e64 v5, |exec_hi|, null      ; encoding: [0x05,0x01,0x36,0xd5,0x7f,0xf8,0x00,0x00]
+v_fmac_f16_e64 v5.l, |exec_hi|, null
+// GFX12: v_fmac_f16_e64 v5.l, |exec_hi|, null    ; encoding: [0x05,0x01,0x36,0xd5,0x7f,0xf8,0x00,0x00]
 
-v_fmac_f16_e64 v5, null, exec_lo
-// GFX12: v_fmac_f16_e64 v5, null, exec_lo        ; encoding: [0x05,0x00,0x36,0xd5,0x7c,0xfc,0x00,0x00]
+v_fmac_f16_e64 v5.l, null, exec_lo
+// GFX12: v_fmac_f16_e64 v5.l, null, exec_lo      ; encoding: [0x05,0x00,0x36,0xd5,0x7c,0xfc,0x00,0x00]
 
-v_fmac_f16_e64 v5, -1, exec_hi
-// GFX12: v_fmac_f16_e64 v5, -1, exec_hi          ; encoding: [0x05,0x00,0x36,0xd5,0xc1,0xfe,0x00,0x00]
+v_fmac_f16_e64 v5.l, -1, exec_hi
+// GFX12: v_fmac_f16_e64 v5.l, -1, exec_hi        ; encoding: [0x05,0x00,0x36,0xd5,0xc1,0xfe,0x00,0x00]
 
-v_fmac_f16_e64 v5, 0.5, -m0 mul:2
-// GFX12: v_fmac_f16_e64 v5, 0.5, -m0 mul:2       ; encoding: [0x05,0x00,0x36,0xd5,0xf0,0xfa,0x00,0x48]
+v_fmac_f16_e64 v5.l, 0.5, -m0 mul:2
+// GFX12: v_fmac_f16_e64 v5.l, 0.5, -m0 mul:2     ; encoding: [0x05,0x00,0x36,0xd5,0xf0,0xfa,0x00,0x48]
 
-v_fmac_f16_e64 v5, -src_scc, |vcc_lo| mul:4
-// GFX12: v_fmac_f16_e64 v5, -src_scc, |vcc_lo| mul:4 ; encoding: [0x05,0x02,0x36,0xd5,0xfd,0xd4,0x00,0x30]
+v_fmac_f16_e64 v5.l, -src_scc, |vcc_lo| mul:4
+// GFX12: v_fmac_f16_e64 v5.l, -src_scc, |vcc_lo| mul:4 ; encoding: [0x05,0x02,0x36,0xd5,0xfd,0xd4,0x00,0x30]
 
-v_fmac_f16_e64 v255, -|0xfe0b|, -|vcc_hi| clamp div:2
-// GFX12: v_fmac_f16_e64 v255, -|0xfe0b|, -|vcc_hi| clamp div:2 ; encoding: [0xff,0x83,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
+v_fmac_f16_e64 v255.l, -|0xfe0b|, -|vcc_hi| clamp div:2
+// GFX12: v_fmac_f16_e64 v255.l, -|0xfe0b|, -|vcc_hi| clamp div:2 ; encoding: [0xff,0x83,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
+
+v_fmac_f16_e64 v5.l, v1.h, v2.l
+// GFX12: v_fmac_f16_e64 v5.l, v1.h, v2.l op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x36,0xd5,0x01,0x05,0x02,0x00]
+
+v_fmac_f16_e64 v5.l, v255.l, v255.h
+// GFX12: v_fmac_f16_e64 v5.l, v255.l, v255.h op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x36,0xd5,0xff,0xff,0x03,0x00]
+
+v_fmac_f16_e64 v255.h, -|0xfe0b|, -|vcc_hi| clamp div:2
+// GFX12: v_fmac_f16_e64 v255.h, -|0xfe0b|, -|vcc_hi| op_sel:[0,0,1,1] clamp div:2 ; encoding: [0xff,0xc3,0x36,0xd5,0xff,0xd6,0x00,0x78,0x0b,0xfe,0x00,0x00]
 
 v_fmac_f32_e64 v5, v1, v2
 // GFX12: v_fmac_f32_e64 v5, v1, v2               ; encoding: [0x05,0x00,0x2b,0xd5,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
index e27706ec02ea..301f756e906a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
@@ -566,6 +566,48 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3
 v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x83,0x2f,0xd5,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
 
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_mirror
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fmac_f16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x36,0xd5,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+
+v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x36,0xd5,0xfa,0x04,0x02,0x48,0x01,0x5f,0x01,0x01]
+
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x36,0xd5,0xfa,0x04,0x02,0x30,0x01,0x60,0x09,0x13]
+
+v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| op_sel:[0,0,1,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc3,0x36,0xd5,0xfa,0xfe,0x03,0x78,0xff,0x6f,0x05,0x30]
+
 v_ldexp_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: v_ldexp_f16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
index 49233697e955..7390720e4dd5 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
@@ -239,6 +239,21 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x83,0x2f,0xd5,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
+v_fmac_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fmac_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x36,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fmac_f16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x36,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fmac_f16_e64_dpp v5.l, |v1.h|, -v2.l op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x36,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fmac_f16_e64_dpp v5.l, -v1.l, |v2.h| op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x36,0xd5,0xea,0x04,0x02,0x30,0x01,0x77,0x39,0x05]
+
+v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fmac_f16_e64_dpp v255.h, -|v255.l|, -|v255.l| op_sel:[0,0,1,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc3,0x36,0xd5,0xe9,0xfe,0x03,0x78,0xff,0x00,0x00,0x00]
+
 v_ldexp_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_ldexp_f16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
diff --git a/llvm/test/MC/Disassembler/RISCV/emit-x8-as-fp.txt b/llvm/test/MC/Disassembler/RISCV/emit-x8-as-fp.txt
new file mode 100644
index 000000000000..23d3f72c2ef5
--- /dev/null
+++ b/llvm/test/MC/Disassembler/RISCV/emit-x8-as-fp.txt
@@ -0,0 +1,22 @@
+# RUN: llvm-mc --disassemble -triple=riscv32 --show-encoding < %s \
+# RUN:   | FileCheck --check-prefixes=DEFAULT %s
+# RUN: llvm-mc --disassemble -triple=riscv64 --show-encoding < %s \
+# RUN:   | FileCheck --check-prefixes=DEFAULT %s
+# RUN: llvm-mc --disassemble -triple=riscv32 -M emit-x8-as-fp \
+# RUN:   --show-encoding < %s | FileCheck --check-prefixes=EMIT-FP %s
+# RUN: llvm-mc --disassemble -triple=riscv64 -M emit-x8-as-fp \
+# RUN:   --show-encoding < %s | FileCheck --check-prefixes=EMIT-FP %s
+# RUN: llvm-mc --disassemble -triple=riscv32 -M numeric -M emit-x8-as-fp \
+# RUN:   --show-encoding < %s | FileCheck --check-prefixes=NUMERIC %s
+# RUN: llvm-mc --disassemble -triple=riscv64 -M numeric -M emit-x8-as-fp \
+# RUN:   --show-encoding < %s | FileCheck --check-prefixes=NUMERIC %s
+
+# DEFAULT: sw a0, -12(s0)  # encoding: [0x23,0x2a,0xa4,0xfe]
+# EMIT-FP: sw a0, -12(fp)  # encoding: [0x23,0x2a,0xa4,0xfe]
+# NUMERIC: sw x10, -12(x8) # encoding: [0x23,0x2a,0xa4,0xfe]
+0x23 0x2a 0xa4 0xfe
+
+# DEFAULT: lw a0, -12(s0)  # encoding: [0x03,0x25,0x44,0xff]
+# EMIT-FP: lw a0, -12(fp)  # encoding: [0x03,0x25,0x44,0xff]
+# NUMERIC: lw x10, -12(x8) # encoding: [0x03,0x25,0x44,0xff]
+0x03 0x25 0x44 0xff
diff --git a/llvm/test/MC/RISCV/emit-x8-as-fp.s b/llvm/test/MC/RISCV/emit-x8-as-fp.s
new file mode 100644
index 000000000000..ea5143f5066f
--- /dev/null
+++ b/llvm/test/MC/RISCV/emit-x8-as-fp.s
@@ -0,0 +1,42 @@
+# RUN: llvm-mc --triple=riscv32 --show-encoding < %s 2>&1 \
+# RUN:   | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mc --triple=riscv64 --show-encoding < %s 2>&1 \
+# RUN:   | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mc --triple=riscv32 -M emit-x8-as-fp --show-encoding < %s 2>&1 \
+# RUN:   | FileCheck --check-prefix=EMIT-FP %s
+# RUN: llvm-mc --triple=riscv64 -M emit-x8-as-fp --show-encoding < %s 2>&1 \
+# RUN:   | FileCheck --check-prefix=EMIT-FP %s
+# RUN: llvm-mc --triple=riscv32 -M numeric -M emit-x8-as-fp --show-encoding \
+# RUN:   < %s 2>&1 | FileCheck --check-prefix=NUMERIC %s
+# RUN: llvm-mc --triple=riscv64 -M numeric -M emit-x8-as-fp --show-encoding \
+# RUN:   < %s 2>&1 | FileCheck --check-prefix=NUMERIC %s
+
+# DEFAULT: sw      a0, -12(s0)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# EMIT-FP: sw      a0, -12(fp)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# NUMERIC: sw      x10, -12(x8)                    # encoding: [0x23,0x2a,0xa4,0xfe]
+sw a0, -12(s0)
+
+# DEFAULT: lw      a0, -12(s0)                     # encoding: [0x03,0x25,0x44,0xff]
+# EMIT-FP: lw      a0, -12(fp)                     # encoding: [0x03,0x25,0x44,0xff]
+# NUMERIC: lw      x10, -12(x8)                    # encoding: [0x03,0x25,0x44,0xff]
+lw a0, -12(s0)
+
+# DEFAULT: sw      a0, -12(s0)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# EMIT-FP: sw      a0, -12(fp)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# NUMERIC: sw      x10, -12(x8)                    # encoding: [0x23,0x2a,0xa4,0xfe]
+sw a0, -12(fp)
+
+# DEFAULT: lw      a0, -12(s0)                     # encoding: [0x03,0x25,0x44,0xff]
+# EMIT-FP: lw      a0, -12(fp)                     # encoding: [0x03,0x25,0x44,0xff]
+# NUMERIC: lw      x10, -12(x8)                    # encoding: [0x03,0x25,0x44,0xff]
+lw a0, -12(fp)
+
+# DEFAULT: sw      a0, -12(s0)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# EMIT-FP: sw      a0, -12(fp)                     # encoding: [0x23,0x2a,0xa4,0xfe]
+# NUMERIC: sw      x10, -12(x8)                    # encoding: [0x23,0x2a,0xa4,0xfe]
+sw a0, -12(x8)
+
+# DEFAULT: lw      a0, -12(s0)                     # encoding: [0x03,0x25,0x44,0xff]
+# EMIT-FP: lw      a0, -12(fp)                     # encoding: [0x03,0x25,0x44,0xff]
+# NUMERIC: lw      x10, -12(x8)                    # encoding: [0x03,0x25,0x44,0xff]
+lw a0, -12(x8)
diff --git a/llvm/test/MC/RISCV/function-call-invalid.s b/llvm/test/MC/RISCV/function-call-invalid.s
index 2b7a85245880..17d02015a694 100644
--- a/llvm/test/MC/RISCV/function-call-invalid.s
+++ b/llvm/test/MC/RISCV/function-call-invalid.s
@@ -10,3 +10,5 @@ call %lo(1234) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 call %hi(foo) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 call %lo(foo) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 call foo, bar # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
+call foo@pls # CHECK: :[[@LINE]]:10: error: @ (except the deprecated/ignored @plt) is disallowed
+call foo@3 # CHECK: :[[@LINE]]:10: error: @ (except the deprecated/ignored @plt) is disallowed
diff --git a/llvm/test/MC/RISCV/tail-call-invalid.s b/llvm/test/MC/RISCV/tail-call-invalid.s
index 270d84df58ac..14ff996b2e4b 100644
--- a/llvm/test/MC/RISCV/tail-call-invalid.s
+++ b/llvm/test/MC/RISCV/tail-call-invalid.s
@@ -10,3 +10,4 @@ tail %hi(1234) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 tail %lo(1234) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 tail %hi(foo) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
 tail %lo(foo) # CHECK: :[[@LINE]]:6: error: operand must be a bare symbol name
+tail foo@pls # CHECK: :[[@LINE]]:10: error: @ (except the deprecated/ignored @plt) is disallowed
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-inline-threshold.ll b/llvm/test/Other/new-pm-lto-prelink-samplepgo-inline-threshold.ll
index 9baedcb02ca0..67af5f18e057 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-inline-threshold.ll
+++ b/llvm/test/Other/new-pm-lto-prelink-samplepgo-inline-threshold.ll
@@ -1,4 +1,4 @@
-; Tests that hot callsite threshold is set to 0 artifically for thinlto-prelink pipeline.
+; Tests that hot callsite threshold is set to 0 artifically for thinlto-prelink and lto-pre-link pipeline.
 ;
 ; Function `sum` is annotated with inline cost -1 and function `sum1` is
 ; annotated with inline cost 0, by function attribute `function-inline-cost`.
@@ -12,6 +12,9 @@
 
 ; RUN: opt < %s -pass-remarks=inline -pass-remarks-missed=inline -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/new-pm-thinlto-prelink-samplepgo-inline-threshold.prof -S 2>&1 | FileCheck %s -check-prefix=REMARK
 
+; RUN: opt < %s -pass-remarks=inline -pass-remarks-missed=inline -passes='lto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/new-pm-thinlto-prelink-samplepgo-inline-threshold.prof -S | FileCheck %s
+
+; RUN: opt < %s -pass-remarks=inline -pass-remarks-missed=inline -passes='lto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/new-pm-thinlto-prelink-samplepgo-inline-threshold.prof -S 2>&1 | FileCheck %s -check-prefix=REMARK
 ; Original C++ test case
 ;
 ; #include <stdio.h>
diff --git a/llvm/test/Transforms/GVN/pr65447.ll b/llvm/test/Transforms/GVN/pr65447.ll
index 1b951e907e82..1fa3811a3a81 100644
--- a/llvm/test/Transforms/GVN/pr65447.ll
+++ b/llvm/test/Transforms/GVN/pr65447.ll
@@ -2,29 +2,98 @@
 ; RUN: opt -S -passes=gvn < %s | FileCheck %s
 
 ; Make sure deduplicated phi nodes are removed from the VN map.
-define i64 @f() {
-; CHECK-LABEL: define i64 @f() {
+define i64 @f2(ptr %arg) {
+; CHECK-LABEL: define i64 @f2(
+; CHECK-SAME: ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  BB:
+; CHECK-NEXT:    store i1 false, ptr [[ARG]], align 1
+; CHECK-NEXT:    br label [[BB2D:%.*]]
+; CHECK:       BB2a:
+; CHECK-NEXT:    br label [[BB2B:%.*]]
+; CHECK:       BB2b:
+; CHECK-NEXT:    br label [[BB2C:%.*]]
+; CHECK:       BB2c:
+; CHECK-NEXT:    [[AZ2:%.*]] = phi i1 [ true, [[BB2B]] ], [ [[AZ:%.*]], [[BB2D]] ]
+; CHECK-NEXT:    [[DOTPHI_TRANS_INSERT:%.*]] = sext i1 [[AZ2]] to i64
+; CHECK-NEXT:    [[GEP2_PHI_TRANS_INSERT:%.*]] = getelementptr i1, ptr [[ARG]], i64 [[DOTPHI_TRANS_INSERT]]
+; CHECK-NEXT:    [[L93_PRE:%.*]] = load i1, ptr [[GEP2_PHI_TRANS_INSERT]], align 1
+; CHECK-NEXT:    br label [[BB2D]]
+; CHECK:       BB2d:
+; CHECK-NEXT:    [[AZ]] = phi i1 [ [[AZ2]], [[BB2C]] ], [ false, [[BB:%.*]] ]
+; CHECK-NEXT:    [[L93:%.*]] = phi i1 [ [[L93_PRE]], [[BB2C]] ], [ false, [[BB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[AZ]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i1, ptr [[ARG]], i64 [[TMP0]]
+; CHECK-NEXT:    store i1 [[AZ]], ptr [[ARG]], align 2
+; CHECK-NEXT:    br i1 [[L93]], label [[BB2C]], label [[BB1E:%.*]]
+; CHECK:       BB1e:
+; CHECK-NEXT:    br i1 [[AZ]], label [[BB2F:%.*]], label [[BB4:%.*]]
+; CHECK:       BB2f:
+; CHECK-NEXT:    store i1 true, ptr [[ARG]], align 2
+; CHECK-NEXT:    br label [[BB2B]]
+; CHECK:       BB4:
+; CHECK-NEXT:    br label [[BB4]]
+;
+BB:
+  store i1 false, ptr %arg, align 1
+  br label %BB2d
+
+BB2a: ; No predecessors!
+  br label %BB2b
+
+BB2b:       ; preds = %BB2f, %BB2a
+  br label %BB2c
+
+BB2c:                     ; preds = %BB2d, %BB2b
+  %0 = phi i1 [ true, %BB2b ], [ %1, %BB2d ]
+  br label %BB2d
+
+BB2d:                              ; preds = %BB2c, %BB
+  %1 = phi i1 [ %0, %BB2c ], [ false, %BB ]
+  %2 = sext i1 %1 to i64
+  %gep2 = getelementptr i1, ptr %arg, i64 %2
+  %L93 = load i1, ptr %gep2, align 1
+  %Az = load i1, ptr %arg, align 2
+  store i1 %1, ptr %arg, align 2
+  br i1 %L93, label %BB2c, label %BB1e
+
+BB1e:                                        ; preds = %BB2d
+  br i1 %Az, label %BB2f, label %BB4
+
+BB2f:                                ; preds = %BB1e
+  store i1 true, ptr %arg, align 2
+  br label %BB2b
+
+BB4:                                              ; preds = %BB1e, %BB4
+  br label %BB4
+
+; uselistorder directives
+  uselistorder label %BB4, { 1, 0 }
+}
+
+; Make sure deduplicated phi nodes are removed from the VN map. Make
+; sure there is no assert on attempt to use ConstantData use lists.
+define i64 @f_null() {
+; CHECK-LABEL: define i64 @f_null() {
 ; CHECK-NEXT:  BB:
 ; CHECK-NEXT:    store i1 false, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB2D:%.*]]
 ; CHECK:       BB2a:
 ; CHECK-NEXT:    br label [[BB2B:%.*]]
 ; CHECK:       BB2b:
-; CHECK-NEXT:    [[L93_PRE_PRE:%.*]] = load i1, ptr inttoptr (i64 -1 to ptr), align 1
 ; CHECK-NEXT:    br label [[BB2C:%.*]]
 ; CHECK:       BB2c:
-; CHECK-NEXT:    [[L93_PRE:%.*]] = phi i1 [ [[L93_PRE_PRE]], [[BB2B]] ], [ true, [[BB2D]] ]
 ; CHECK-NEXT:    [[AZ2:%.*]] = phi i1 [ true, [[BB2B]] ], [ [[AZ:%.*]], [[BB2D]] ]
 ; CHECK-NEXT:    [[DOTPHI_TRANS_INSERT:%.*]] = sext i1 [[AZ2]] to i64
 ; CHECK-NEXT:    [[GEP2_PHI_TRANS_INSERT:%.*]] = getelementptr i1, ptr null, i64 [[DOTPHI_TRANS_INSERT]]
+; CHECK-NEXT:    [[L93_PRE:%.*]] = load i1, ptr [[GEP2_PHI_TRANS_INSERT]], align 1
 ; CHECK-NEXT:    br label [[BB2D]]
 ; CHECK:       BB2d:
-; CHECK-NEXT:    [[L93_PRE5:%.*]] = phi i1 [ [[L93_PRE]], [[BB2C]] ], [ false, [[BB:%.*]] ]
-; CHECK-NEXT:    [[AZ]] = phi i1 [ [[AZ2]], [[BB2C]] ], [ false, [[BB]] ]
+; CHECK-NEXT:    [[AZ]] = phi i1 [ [[AZ2]], [[BB2C]] ], [ false, [[BB:%.*]] ]
+; CHECK-NEXT:    [[L93:%.*]] = phi i1 [ [[L93_PRE]], [[BB2C]] ], [ false, [[BB]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i1 [[AZ]] to i64
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i1, ptr null, i64 [[TMP0]]
 ; CHECK-NEXT:    store i1 [[AZ]], ptr null, align 2
-; CHECK-NEXT:    br i1 [[L93_PRE5]], label [[BB2C]], label [[BB1E:%.*]]
+; CHECK-NEXT:    br i1 [[L93]], label [[BB2C]], label [[BB1E:%.*]]
 ; CHECK:       BB1e:
 ; CHECK-NEXT:    br i1 [[AZ]], label [[BB2F:%.*]], label [[BB4:%.*]]
 ; CHECK:       BB2f:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-abs-srshl.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-abs-srshl.ll
index 7fb0fbdda0b5..f71aaa289b89 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-abs-srshl.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-abs-srshl.ll
@@ -42,8 +42,7 @@ define <vscale x 8 x i16> @srshl_abs_positive_merge(<vscale x 8 x i16> %a, <vsca
 
 define <vscale x 8 x i16> @srshl_abs_all_active_pred(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i1> %pg2) #0 {
 ; CHECK-LABEL: @srshl_abs_all_active_pred(
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[ABS:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> [[B:%.*]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A:%.*]])
+; CHECK-NEXT:    [[ABS:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> [[B:%.*]], <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A:%.*]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1> [[PG2:%.*]], <vscale x 8 x i16> [[ABS]], <vscale x 8 x i16> splat (i16 2))
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
index b8ea4de3d238..1c5f7464d858 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-all-active-lanes-cvt.ll
@@ -5,8 +5,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_poison(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_poison(
 ; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -17,8 +16,7 @@ define <vscale x 8 x bfloat> @test_fcvt_bf16_f32_poison(<vscale x 8 x bfloat> %a
 define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(
 ; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -29,8 +27,7 @@ define <vscale x 8 x bfloat> @test_fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vsca
 define <vscale x 8 x half> @test_fcvt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f32(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -41,8 +38,7 @@ define <vscale x 8 x half> @test_fcvt_f16_f32(<vscale x 8 x half> %a, <vscale x
 define <vscale x 8 x half> @test_fcvt_f16_f64(<vscale x 8 x half> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_fcvt_f16_f64(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -53,8 +49,7 @@ define <vscale x 8 x half> @test_fcvt_f16_f64(<vscale x 8 x half> %a, <vscale x
 define <vscale x 4 x float> @test_fcvt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f16(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -65,8 +60,7 @@ define <vscale x 4 x float> @test_fcvt_f32_f16(<vscale x 4 x float> %a, <vscale
 define <vscale x 4 x float> @test_fcvt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvt_f32_f64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -77,8 +71,7 @@ define <vscale x 4 x float> @test_fcvt_f32_f64(<vscale x 4 x float> %a, <vscale
 define <vscale x 2 x double> @test_fcvt_f64_f16(<vscale x 2 x double> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f16(
 ; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -89,8 +82,7 @@ define <vscale x 2 x double> @test_fcvt_f64_f16(<vscale x 2 x double> %a, <vscal
 define <vscale x 2 x double> @test_fcvt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 2 x double> @test_fcvt_f64_f32(
 ; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -101,8 +93,7 @@ define <vscale x 2 x double> @test_fcvt_f64_f32(<vscale x 2 x double> %a, <vscal
 define <vscale x 4 x float> @test_fcvtlt_f32_f16(<vscale x 4 x float> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtlt_f32_f16(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -113,8 +104,7 @@ define <vscale x 4 x float> @test_fcvtlt_f32_f16(<vscale x 4 x float> %a, <vscal
 define <vscale x 2 x double> @test_fcvtlt_f64_f32(<vscale x 2 x double> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 2 x double> @test_fcvtlt_f64_f32(
 ; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -125,8 +115,7 @@ define <vscale x 2 x double> @test_fcvtlt_f64_f32(<vscale x 2 x double> %a, <vsc
 define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(
 ; CHECK-SAME: <vscale x 8 x bfloat> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32.v2(<vscale x 8 x bfloat> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32.v2(<vscale x 8 x bfloat> [[A]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -137,8 +126,7 @@ define <vscale x 8 x bfloat> @test_fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vs
 define <vscale x 8 x half> @test_fcvtnt_f16_f32(<vscale x 8 x half> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_fcvtnt_f16_f32(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtnt.f16f32(<vscale x 8 x half> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtnt.f16f32(<vscale x 8 x half> [[A]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -149,8 +137,7 @@ define <vscale x 8 x half> @test_fcvtnt_f16_f32(<vscale x 8 x half> %a, <vscale
 define <vscale x 4 x float> @test_fcvtnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtnt_f32_f64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtnt.f32f64(<vscale x 4 x float> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtnt.f32f64(<vscale x 4 x float> [[A]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -161,8 +148,7 @@ define <vscale x 4 x float> @test_fcvtnt_f32_f64(<vscale x 4 x float> %a, <vscal
 define <vscale x 4 x float> @test_fcvtx_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtx_f32_f64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -173,8 +159,7 @@ define <vscale x 4 x float> @test_fcvtx_f32_f64(<vscale x 4 x float> %a, <vscale
 define <vscale x 4 x float> @test_fcvtxnt_f32_f64(<vscale x 4 x float> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_fcvtxnt_f32_f64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtxnt.f32f64(<vscale x 4 x float> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fcvtxnt.f32f64(<vscale x 4 x float> [[A]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -185,8 +170,7 @@ define <vscale x 4 x float> @test_fcvtxnt_f32_f64(<vscale x 4 x float> %a, <vsca
 define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzs(
 ; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
 ;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -197,8 +181,7 @@ define <vscale x 8 x i16> @test_fcvtzs(<vscale x 8 x i16> %a, <vscale x 8 x half
 define <vscale x 4 x i32> @test_fcvtzs_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f16(
 ; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -209,8 +192,7 @@ define <vscale x 4 x i32> @test_fcvtzs_i32_f16(<vscale x 4 x i32> %a, <vscale x
 define <vscale x 4 x i32> @test_fcvtzs_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzs_i32_f64(
 ; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -221,8 +203,7 @@ define <vscale x 4 x i32> @test_fcvtzs_i32_f64(<vscale x 4 x i32> %a, <vscale x
 define <vscale x 2 x i64> @test_fcvtzs_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f16(
 ; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -233,8 +214,7 @@ define <vscale x 2 x i64> @test_fcvtzs_i64_f16(<vscale x 2 x i64> %a, <vscale x
 define <vscale x 2 x i64> @test_fcvtzs_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzs_i64_f32(
 ; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -245,8 +225,7 @@ define <vscale x 2 x i64> @test_fcvtzs_i64_f32(<vscale x 2 x i64> %a, <vscale x
 define <vscale x 8 x i16> @test_fcvtzu(<vscale x 8 x i16> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 8 x i16> @test_fcvtzu(
 ; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
 ;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -257,8 +236,7 @@ define <vscale x 8 x i16> @test_fcvtzu(<vscale x 8 x i16> %a, <vscale x 8 x half
 define <vscale x 4 x i32> @test_fcvtzu_i32_f16(<vscale x 4 x i32> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f16(
 ; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -269,8 +247,7 @@ define <vscale x 4 x i32> @test_fcvtzu_i32_f16(<vscale x 4 x i32> %a, <vscale x
 define <vscale x 4 x i32> @test_fcvtzu_i32_f64(<vscale x 4 x i32> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @test_fcvtzu_i32_f64(
 ; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -281,8 +258,7 @@ define <vscale x 4 x i32> @test_fcvtzu_i32_f64(<vscale x 4 x i32> %a, <vscale x
 define <vscale x 2 x i64> @test_fcvtzu_i64_f16(<vscale x 2 x i64> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f16(
 ; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 8 x half> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -293,8 +269,7 @@ define <vscale x 2 x i64> @test_fcvtzu_i64_f16(<vscale x 2 x i64> %a, <vscale x
 define <vscale x 2 x i64> @test_fcvtzu_i64_f32(<vscale x 2 x i64> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: define <vscale x 2 x i64> @test_fcvtzu_i64_f32(
 ; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x float> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -305,8 +280,7 @@ define <vscale x 2 x i64> @test_fcvtzu_i64_f32(<vscale x 2 x i64> %a, <vscale x
 define <vscale x 8 x half> @test_scvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -317,8 +291,7 @@ define <vscale x 8 x half> @test_scvtf(<vscale x 8 x half> %a, <vscale x 8 x i16
 define <vscale x 8 x half> @test_scvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i32(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -329,8 +302,7 @@ define <vscale x 8 x half> @test_scvtf_f16_i32(<vscale x 8 x half> %a, <vscale x
 define <vscale x 8 x half> @test_scvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_scvtf_f16_i64(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.scvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -341,8 +313,7 @@ define <vscale x 8 x half> @test_scvtf_f16_i64(<vscale x 8 x half> %a,<vscale x
 define <vscale x 4 x float> @test_scvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_scvtf_f32_i64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.scvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.scvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -353,8 +324,7 @@ define <vscale x 4 x float> @test_scvtf_f32_i64(<vscale x 4 x float> %a, <vscale
 define <vscale x 2 x double>  @test_scvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: define <vscale x 2 x double> @test_scvtf_f64_i32(
 ; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.scvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.scvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x i32> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -365,8 +335,7 @@ define <vscale x 2 x double>  @test_scvtf_f64_i32(<vscale x 2 x double>  %a, <vs
 define <vscale x 8 x half> @test_ucvtf(<vscale x 8 x half> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(<vscale x 8 x half> undef, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -377,8 +346,7 @@ define <vscale x 8 x half> @test_ucvtf(<vscale x 8 x half> %a, <vscale x 8 x i16
 define <vscale x 8 x half> @test_ucvtf_f16_i32(<vscale x 8 x half> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i32(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i32(<vscale x 8 x half> undef, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -389,8 +357,7 @@ define <vscale x 8 x half> @test_ucvtf_f16_i32(<vscale x 8 x half> %a, <vscale x
 define <vscale x 8 x half> @test_ucvtf_f16_i64(<vscale x 8 x half> %a,<vscale x 2 x i64> %b) {
 ; CHECK-LABEL: define <vscale x 8 x half> @test_ucvtf_f16_i64(
 ; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ucvtf.f16i64(<vscale x 8 x half> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[B]])
 ; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -401,8 +368,7 @@ define <vscale x 8 x half> @test_ucvtf_f16_i64(<vscale x 8 x half> %a,<vscale x
 define <vscale x 4 x float> @test_ucvtf_f32_i64(<vscale x 4 x float> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: define <vscale x 4 x float> @test_ucvtf_f32_i64(
 ; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.ucvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.ucvtf.f32i64(<vscale x 4 x float> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[B]])
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -413,8 +379,7 @@ define <vscale x 4 x float> @test_ucvtf_f32_i64(<vscale x 4 x float> %a, <vscale
 define <vscale x 2 x double>  @test_ucvtf_f64_i32(<vscale x 2 x double>  %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: define <vscale x 2 x double> @test_ucvtf_f64_i32(
 ; CHECK-SAME: <vscale x 2 x double> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) {
-; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ucvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ucvtf.f64i32(<vscale x 2 x double> undef, <vscale x 2 x i1> splat (i1 true), <vscale x 4 x i32> [[B]])
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %pg = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
index c67662f87250..d8d674029853 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
@@ -5,7 +5,7 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 4 x i32> @combine_ld1(ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_ld1(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[PTR:%.*]], align 16, !annotation !0
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[PTR:%.*]], align 16, !annotation [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -15,7 +15,7 @@ define <vscale x 4 x i32> @combine_ld1(ptr %ptr) #0 {
 
 define <vscale x 4 x i32> @combine_ld1_casted_predicate(ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_ld1_casted_predicate(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[PTR:%.*]], align 16, !annotation !0
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[PTR:%.*]], align 16, !annotation [[META0]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -28,7 +28,7 @@ define <vscale x 4 x i32> @combine_ld1_casted_predicate(ptr %ptr) #0 {
 define <vscale x 4 x i32> @combine_ld1_masked(ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_ld1_masked(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> zeroinitializer), !annotation !0
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> zeroinitializer), !annotation [[META0]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
@@ -38,10 +38,9 @@ define <vscale x 4 x i32> @combine_ld1_masked(ptr %ptr) #0 {
 
 define <vscale x 8 x i16> @combine_ld1_masked_casted_predicate(ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_ld1_masked_casted_predicate(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PTR:%.*]], i32 1, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> zeroinitializer), !annotation !0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PTR:%.*]], i32 1, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> zeroinitializer), !annotation [[META0]]
 ; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -53,7 +52,7 @@ define <vscale x 8 x i16> @combine_ld1_masked_casted_predicate(ptr %ptr) #0 {
 
 define void @combine_st1(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_st1(
-; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], align 16, !annotation !0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], align 16, !annotation [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -63,7 +62,7 @@ define void @combine_st1(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
 
 define void @combine_st1_casted_predicate(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_st1_casted_predicate(
-; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], align 16, !annotation !0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], align 16, !annotation [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -76,7 +75,7 @@ define void @combine_st1_casted_predicate(<vscale x 4 x i32> %vec, ptr %ptr) #0
 define void @combine_st1_masked(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_st1_masked(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]]), !annotation !0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]]), !annotation [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
@@ -86,10 +85,9 @@ define void @combine_st1_masked(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
 
 define void @combine_st1_masked_casted_predicate(<vscale x 8 x i16> %vec, ptr %ptr) #0 {
 ; CHECK-LABEL: @combine_st1_masked_casted_predicate(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 8 x i1> [[TMP3]]), !annotation !0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 8 x i1> [[TMP3]]), !annotation [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll
index 6a2c0f8689ca..93de2e731606 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll
@@ -22,8 +22,7 @@ define <vscale x 16 x i1> @dupq_b_0() #0 {
 define <vscale x 16 x i1> @dupq_b_d() #0 {
 ; CHECK-LABEL: define <vscale x 16 x i1> @dupq_b_d(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
@@ -38,8 +37,7 @@ define <vscale x 16 x i1> @dupq_b_d() #0 {
 define <vscale x 16 x i1> @dupq_b_w() #0 {
 ; CHECK-LABEL: define <vscale x 16 x i1> @dupq_b_w(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
@@ -54,8 +52,7 @@ define <vscale x 16 x i1> @dupq_b_w() #0 {
 define <vscale x 16 x i1> @dupq_b_h() #0 {
 ; CHECK-LABEL: define <vscale x 16 x i1> @dupq_b_h(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
@@ -70,8 +67,7 @@ define <vscale x 16 x i1> @dupq_b_h() #0 {
 define <vscale x 16 x i1> @dupq_b_b() #0 {
 ; CHECK-LABEL: define <vscale x 16 x i1> @dupq_b_b(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> splat (i1 true)
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison,
@@ -101,9 +97,8 @@ define <vscale x 8 x i1> @dupq_h_0() #0 {
 define <vscale x 8 x i1> @dupq_h_d() #0 {
 ; CHECK-LABEL: define <vscale x 8 x i1> @dupq_h_d(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 8 x i1> [[TMP3]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -118,9 +113,8 @@ define <vscale x 8 x i1> @dupq_h_d() #0 {
 define <vscale x 8 x i1> @dupq_h_w() #0 {
 ; CHECK-LABEL: define <vscale x 8 x i1> @dupq_h_w(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 8 x i1> [[TMP3]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -135,8 +129,7 @@ define <vscale x 8 x i1> @dupq_h_w() #0 {
 define <vscale x 8 x i1> @dupq_h_h() #0 {
 ; CHECK-LABEL: define <vscale x 8 x i1> @dupq_h_h(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    ret <vscale x 8 x i1> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 8 x i1> splat (i1 true)
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison,
@@ -166,9 +159,8 @@ define <vscale x 4 x i1> @dupq_w_0() #0 {
 define <vscale x 4 x i1> @dupq_w_d() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_w_d(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP3]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -183,8 +175,7 @@ define <vscale x 4 x i1> @dupq_w_d() #0 {
 define <vscale x 4 x i1> @dupq_w_w() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_w_w(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 4 x i1> splat (i1 true)
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison,
@@ -214,8 +205,7 @@ define <vscale x 2 x i1> @dupq_d_0() #0 {
 define <vscale x 2 x i1> @dupq_d_d() #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_d_d(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 2 x i1> splat (i1 true)
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison,
@@ -231,10 +221,9 @@ define <vscale x 2 x i1> @dupq_d_d() #0 {
 define <vscale x 2 x i1> @dupq_neg1() #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg1(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> <i64 1, i64 0>, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -249,10 +238,9 @@ define <vscale x 2 x i1> @dupq_neg1() #0 {
 define <vscale x 4 x i1> @dupq_neg2() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_neg2(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -267,10 +255,9 @@ define <vscale x 4 x i1> @dupq_neg2() #0 {
 define <vscale x 4 x i1> @dupq_neg3() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_neg3(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -285,10 +272,9 @@ define <vscale x 4 x i1> @dupq_neg3() #0 {
 define <vscale x 4 x i1> @dupq_neg4() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_neg4(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -303,10 +289,9 @@ define <vscale x 4 x i1> @dupq_neg4() #0 {
 define <vscale x 4 x i1> @dupq_neg5() #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_neg5(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -321,12 +306,11 @@ define <vscale x 4 x i1> @dupq_neg5() #0 {
 define <vscale x 4 x i1> @dupq_neg6(i1 %a) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i1> @dupq_neg6(
 ; CHECK-SAME: i1 [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[A]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 poison>, i32 [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[TMP3]], i64 0)
 ; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[TMP5]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP5]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP6]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -342,10 +326,9 @@ define <vscale x 4 x i1> @dupq_neg6(i1 %a) #0 {
 define <vscale x 2 x i1> @dupq_neg7() #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg7(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> splat (i64 1), i64 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -360,10 +343,9 @@ define <vscale x 2 x i1> @dupq_neg7() #0 {
 define <vscale x 2 x i1> @dupq_neg8() #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg8(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> splat (i64 1), i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -378,10 +360,9 @@ define <vscale x 2 x i1> @dupq_neg8() #0 {
 define <vscale x 2 x i1> @dupq_neg9(<vscale x 2 x i64> %x) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg9(
 ; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> [[X]], <2 x i64> splat (i64 1), i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -396,10 +377,9 @@ define <vscale x 2 x i1> @dupq_neg9(<vscale x 2 x i64> %x) #0 {
 define <vscale x 2 x i1> @dupq_neg10() #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg10(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> splat (i64 1), i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> splat (i64 1))
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> splat (i64 1))
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -448,10 +428,9 @@ define <vscale x 2 x i1> @dupq_neg12() #0 {
 define <vscale x 2 x i1> @dupq_neg13(<vscale x 2 x i64> %x) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i1> @dupq_neg13(
 ; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> splat (i64 1), i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[X]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[X]])
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -465,10 +444,9 @@ define <vscale x 2 x i1> @dupq_neg13(<vscale x 2 x i64> %x) #0 {
 define <vscale x 16 x i1> @dupq_b_idx(i64 %idx) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i1> @dupq_b_idx(
 ; CHECK-SAME: i64 [[IDX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 [[IDX]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[TMP3]], <vscale x 2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP4]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-strictfp.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-strictfp.ll
index f6f60d6d64e7..690bcd3dc033 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-strictfp.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-strictfp.ll
@@ -8,8 +8,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 2 x double> @replace_fadd_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @replace_fadd_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #1
@@ -22,8 +21,7 @@ define <vscale x 2 x double> @replace_fadd_intrinsic_double_strictfp(<vscale x 2
 define <vscale x 2 x double> @call_replace_fadd_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @call_replace_fadd_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fadd.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = call <vscale x 2 x double> @replace_fadd_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1
@@ -35,8 +33,7 @@ define <vscale x 2 x double> @call_replace_fadd_intrinsic_double_strictfp(<vscal
 define <vscale x 2 x double> @replace_fmul_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @replace_fmul_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #1
@@ -49,8 +46,7 @@ define <vscale x 2 x double> @replace_fmul_intrinsic_double_strictfp(<vscale x 2
 define <vscale x 2 x double> @call_replace_fmul_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @call_replace_fmul_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = call <vscale x 2 x double> @replace_fmul_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1
@@ -62,8 +58,7 @@ define <vscale x 2 x double> @call_replace_fmul_intrinsic_double_strictfp(<vscal
 define <vscale x 2 x double> @replace_fsub_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @replace_fsub_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #1
@@ -76,8 +71,7 @@ define <vscale x 2 x double> @replace_fsub_intrinsic_double_strictfp(<vscale x 2
 define <vscale x 2 x double> @call_replace_fsub_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK: Function Attrs: strictfp
 ; CHECK-LABEL: @call_replace_fsub_intrinsic_double_strictfp(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) #[[ATTR2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fsub.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
 ;
   %1 = call <vscale x 2 x double> @replace_fsub_intrinsic_double_strictfp(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-combine-to-u-forms.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-combine-to-u-forms.ll
index 60b2efe27168..bbf4b3c65c30 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-combine-to-u-forms.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-combine-to-u-forms.ll
@@ -16,9 +16,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fabd_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fabd_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fabd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -29,9 +28,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fabd_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fabd_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fabd.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -42,9 +40,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fabd_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fabd_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fabd.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -117,9 +114,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fdiv_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fdiv_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fdiv.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -130,9 +126,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fdiv_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fdiv_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fdiv.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -143,9 +138,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fdiv_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fdiv_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fdiv.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fdiv.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fdiv.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -168,9 +162,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fmax_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmax_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmax.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmax.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmax.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -181,9 +174,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fmax_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmax_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmax.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmax.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmax.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -194,9 +186,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fmax_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmax_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmax.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmax.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmax.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -219,9 +210,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1>,
 define <vscale x 8 x half> @replace_fmaxnm_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmaxnm_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -232,9 +222,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1>,
 define <vscale x 4 x float> @replace_fmaxnm_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmaxnm_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -245,9 +234,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1>
 define <vscale x 2 x double> @replace_fmaxnm_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmaxnm_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -270,9 +258,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmin.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fmin_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmin_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmin.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmin.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmin.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -283,9 +270,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fmin_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmin_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmin.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmin.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmin.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -296,9 +282,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmin.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fmin_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmin_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmin.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmin.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmin.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -321,9 +306,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1>,
 define <vscale x 8 x half> @replace_fminnm_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fminnm_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fminnm.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fminnm.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -334,9 +318,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1>,
 define <vscale x 4 x float> @replace_fminnm_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fminnm_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fminnm.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fminnm.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -347,9 +330,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1>
 define <vscale x 2 x double> @replace_fminnm_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fminnm_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fminnm.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fminnm.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -372,9 +354,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fmla_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmla_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]], <vscale x 8 x half> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
@@ -385,9 +366,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmla.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fmla_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmla_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmla.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmla.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmla.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
@@ -398,9 +378,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmla.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fmla_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmla_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmla.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmla.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmla.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
@@ -423,9 +402,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmls.nxv8f16(<vscale x 8 x i1>, <v
 define <vscale x 8 x half> @replace_fmls_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmls_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]], <vscale x 8 x half> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmls.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmls.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmls.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
@@ -436,9 +414,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmls.nxv4f32(<vscale x 4 x i1>, <
 define <vscale x 4 x float> @replace_fmls_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmls_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmls.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmls.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmls.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
@@ -449,9 +426,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmls.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fmls_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmls_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmls.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmls.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmls.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
@@ -524,9 +500,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fmulx.nxv8f16(<vscale x 8 x i1>, <
 define <vscale x 8 x half> @replace_fmulx_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fmulx_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmulx.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmulx.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmulx.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -537,9 +512,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fmulx.nxv4f32(<vscale x 4 x i1>,
 define <vscale x 4 x float> @replace_fmulx_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fmulx_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmulx.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmulx.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fmulx.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -550,9 +524,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fmulx.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fmulx_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fmulx_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmulx.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmulx.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fmulx.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
@@ -575,9 +548,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fnmla.nxv8f16(<vscale x 8 x i1>, <
 define <vscale x 8 x half> @replace_fnmla_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fnmla_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]], <vscale x 8 x half> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmla.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmla.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmla.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
@@ -588,9 +560,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fnmla.nxv4f32(<vscale x 4 x i1>,
 define <vscale x 4 x float> @replace_fnmla_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fnmla_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmla.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmla.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmla.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
@@ -601,9 +572,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fnmla.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fnmla_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fnmla_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmla.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmla.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmla.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
@@ -626,9 +596,8 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.fnmls.nxv8f16(<vscale x 8 x i1>, <
 define <vscale x 8 x half> @replace_fnmls_intrinsic_half(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x half> @replace_fnmls_intrinsic_half
 ; CHECK-SAME: (<vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]], <vscale x 8 x half> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmls.u.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmls.u.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]], <vscale x 8 x half> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmls.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
@@ -639,9 +608,8 @@ declare <vscale x 4 x float> @llvm.aarch64.sve.fnmls.nxv4f32(<vscale x 4 x i1>,
 define <vscale x 4 x float> @replace_fnmls_intrinsic_float(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @replace_fnmls_intrinsic_float
 ; CHECK-SAME: (<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmls.u.nxv4f32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmls.u.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[A]], <vscale x 4 x float> [[B]], <vscale x 4 x float> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.fnmls.nxv4f32(<vscale x 4 x i1> %1, <vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
@@ -652,9 +620,8 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fnmls.nxv2f64(<vscale x 2 x i1>,
 define <vscale x 2 x double> @replace_fnmls_intrinsic_double(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x double> @replace_fnmls_intrinsic_double
 ; CHECK-SAME: (<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmls.u.nxv2f64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmls.u.nxv2f64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> [[A]], <vscale x 2 x double> [[B]], <vscale x 2 x double> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.fnmls.nxv2f64(<vscale x 2 x i1> %1, <vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
@@ -729,9 +696,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_add_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_add_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.add.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.add.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -742,9 +708,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_add_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_add_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.add.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.add.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -755,9 +720,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_add_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_add_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.add.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.add.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -768,9 +732,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.add.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_add_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_add_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.add.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.add.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.add.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -793,9 +756,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.mla.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_mla_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_mla_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], <vscale x 16 x i8> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mla.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]], <vscale x 16 x i8> [[C]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mla.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]], <vscale x 16 x i8> [[C]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mla.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
@@ -806,9 +768,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.mla.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_mla_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_mla_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]], <vscale x 8 x i16> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mla.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]], <vscale x 8 x i16> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mla.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]], <vscale x 8 x i16> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mla.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c)
@@ -819,9 +780,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.mla.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_mla_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_mla_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mla.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mla.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mla.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
@@ -832,9 +792,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.mla.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_mla_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_mla_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]], <vscale x 2 x i64> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mla.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]], <vscale x 2 x i64> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mla.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]], <vscale x 2 x i64> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mla.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
@@ -857,9 +816,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.mls.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_mls_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_mls_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], <vscale x 16 x i8> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mls.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]], <vscale x 16 x i8> [[C]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mls.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]], <vscale x 16 x i8> [[C]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mls.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
@@ -870,9 +828,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.mls.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_mls_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_mls_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]], <vscale x 8 x i16> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mls.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]], <vscale x 8 x i16> [[C]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mls.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]], <vscale x 8 x i16> [[C]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mls.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c)
@@ -883,9 +840,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.mls.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_mls_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_mls_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mls.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mls.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mls.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
@@ -896,9 +852,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.mls.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_mls_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_mls_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]], <vscale x 2 x i64> [[C:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mls.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]], <vscale x 2 x i64> [[C]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mls.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]], <vscale x 2 x i64> [[C]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mls.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
@@ -921,9 +876,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_mul_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_mul_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -934,9 +888,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_mul_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_mul_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -947,9 +900,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_mul_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_mul_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -960,9 +912,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_mul_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_mul_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -985,9 +936,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.sabd.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_sabd_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_sabd_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sabd.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sabd.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sabd.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -998,9 +948,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.sabd.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_sabd_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_sabd_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sabd.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sabd.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sabd.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1011,9 +960,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.sabd.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_sabd_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_sabd_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sabd.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sabd.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sabd.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1024,9 +972,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.sabd.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_sabd_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_sabd_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sabd.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sabd.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sabd.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1049,9 +996,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.smax.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_smax_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_smax_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smax.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smax.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smax.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1062,9 +1008,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.smax.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_smax_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_smax_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smax.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smax.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smax.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1075,9 +1020,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_smax_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_smax_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smax.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1088,9 +1032,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.smax.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_smax_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_smax_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smax.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smax.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smax.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1113,9 +1056,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.smin.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_smin_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_smin_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smin.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smin.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smin.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1126,9 +1068,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.smin.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_smin_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_smin_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smin.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smin.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smin.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1139,9 +1080,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_smin_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_smin_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smin.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1152,9 +1092,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.smin.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_smin_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_smin_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smin.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smin.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smin.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1177,9 +1116,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1>, <
 define <vscale x 16 x i8> @replace_smulh_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_smulh_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.smulh.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1190,9 +1128,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1>, <v
 define <vscale x 8 x i16> @replace_smulh_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_smulh_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.smulh.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1203,9 +1140,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @replace_smulh_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_smulh_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.smulh.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1216,9 +1152,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1>, <v
 define <vscale x 2 x i64> @replace_smulh_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_smulh_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.smulh.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1241,9 +1176,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_sub_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_sub_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1254,9 +1188,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.sub.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_sub_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_sub_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sub.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sub.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1267,9 +1200,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_sub_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_sub_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sub.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1280,9 +1212,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.sub.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_sub_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_sub_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sub.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sub.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1305,9 +1236,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.uabd.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_uabd_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_uabd_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uabd.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uabd.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uabd.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1318,9 +1248,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.uabd.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_uabd_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_uabd_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uabd.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uabd.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uabd.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1331,9 +1260,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uabd.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_uabd_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_uabd_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uabd.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uabd.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uabd.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1344,9 +1272,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.uabd.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_uabd_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_uabd_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uabd.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uabd.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uabd.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1369,9 +1296,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.umax.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_umax_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_umax_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umax.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umax.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umax.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1382,9 +1308,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.umax.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_umax_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_umax_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umax.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umax.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umax.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1395,9 +1320,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_umax_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_umax_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umax.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1408,9 +1332,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.umax.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_umax_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_umax_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umax.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umax.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umax.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1433,9 +1356,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.umin.nxv16i8(<vscale x 16 x i1>, <v
 define <vscale x 16 x i8> @replace_umin_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_umin_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umin.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umin.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umin.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1446,9 +1368,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.umin.nxv8i16(<vscale x 8 x i1>, <vs
 define <vscale x 8 x i16> @replace_umin_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_umin_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umin.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umin.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umin.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1459,9 +1380,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1>, <vs
 define <vscale x 4 x i32> @replace_umin_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_umin_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umin.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1472,9 +1392,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.umin.nxv2i64(<vscale x 2 x i1>, <vs
 define <vscale x 2 x i64> @replace_umin_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_umin_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umin.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umin.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umin.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1497,9 +1416,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1>, <
 define <vscale x 16 x i8> @replace_umulh_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_umulh_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.umulh.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1510,9 +1428,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1>, <v
 define <vscale x 8 x i16> @replace_umulh_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_umulh_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.umulh.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1523,9 +1440,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @replace_umulh_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_umulh_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.umulh.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1536,9 +1452,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1>, <v
 define <vscale x 2 x i64> @replace_umulh_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_umulh_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.umulh.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1563,9 +1478,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_asr_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_asr_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asr.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asr.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asr.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1576,9 +1490,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_asr_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_asr_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.asr.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.asr.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.asr.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1589,9 +1502,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_asr_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_asr_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.asr.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.asr.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.asr.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1602,9 +1514,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_asr_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_asr_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.asr.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.asr.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.asr.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1627,9 +1538,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_lsl_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_lsl_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsl.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1640,9 +1550,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_lsl_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_lsl_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1653,9 +1562,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_lsl_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_lsl_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1666,9 +1574,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_lsl_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_lsl_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsl.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1691,9 +1598,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_lsr_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_lsr_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.lsr.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1704,9 +1610,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_lsr_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_lsr_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1717,9 +1622,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_lsr_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_lsr_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1730,9 +1634,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_lsr_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_lsr_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.lsr.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1757,9 +1660,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.and.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_and_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_and_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.and.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.and.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.and.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1770,9 +1672,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.and.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_and_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_and_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.and.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1783,9 +1684,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.and.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_and_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_and_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.and.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1796,9 +1696,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_and_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_and_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.and.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1821,9 +1720,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.bic.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_bic_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_bic_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.bic.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.bic.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.bic.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1834,9 +1732,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_bic_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_bic_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.bic.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.bic.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1847,9 +1744,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.bic.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_bic_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_bic_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.bic.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.bic.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.bic.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1860,9 +1756,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_bic_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_bic_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.bic.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.bic.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1885,9 +1780,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.eor.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_eor_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_eor_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.eor.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.eor.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.eor.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1898,9 +1792,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.eor.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_eor_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_eor_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.eor.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.eor.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.eor.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1911,9 +1804,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.eor.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_eor_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_eor_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.eor.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.eor.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.eor.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1924,9 +1816,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.eor.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_eor_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_eor_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.eor.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.eor.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.eor.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -1949,9 +1840,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.orr.nxv16i8(<vscale x 16 x i1>, <vs
 define <vscale x 16 x i8> @replace_orr_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_orr_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.orr.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.orr.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.orr.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -1962,9 +1852,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.orr.nxv8i16(<vscale x 8 x i1>, <vsc
 define <vscale x 8 x i16> @replace_orr_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_orr_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.orr.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.orr.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.orr.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -1975,9 +1864,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1>, <vsc
 define <vscale x 4 x i32> @replace_orr_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_orr_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.orr.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -1988,9 +1876,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.orr.nxv2i64(<vscale x 2 x i1>, <vsc
 define <vscale x 2 x i64> @replace_orr_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_orr_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.orr.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.orr.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.orr.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -2015,9 +1902,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.sqsub.nxv16i8(<vscale x 16 x i1>, <
 define <vscale x 16 x i8> @replace_sqsub_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_sqsub_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sqsub.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sqsub.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sqsub.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -2028,9 +1914,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.sqsub.nxv8i16(<vscale x 8 x i1>, <v
 define <vscale x 8 x i16> @replace_sqsub_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_sqsub_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sqsub.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sqsub.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sqsub.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -2041,9 +1926,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.sqsub.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @replace_sqsub_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_sqsub_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sqsub.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sqsub.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.sqsub.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -2054,9 +1938,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.sqsub.nxv2i64(<vscale x 2 x i1>, <v
 define <vscale x 2 x i64> @replace_sqsub_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_sqsub_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sqsub.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sqsub.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sqsub.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
@@ -2079,9 +1962,8 @@ declare <vscale x 16 x i8> @llvm.aarch64.sve.uqsub.nxv16i8(<vscale x 16 x i1>, <
 define <vscale x 16 x i8> @replace_uqsub_intrinsic_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: define <vscale x 16 x i8> @replace_uqsub_intrinsic_i8
 ; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uqsub.u.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uqsub.u.nxv16i8(<vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
 ;
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
   %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.uqsub.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -2092,9 +1974,8 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.uqsub.nxv8i16(<vscale x 8 x i1>, <v
 define <vscale x 8 x i16> @replace_uqsub_intrinsic_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: define <vscale x 8 x i16> @replace_uqsub_intrinsic_i16
 ; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uqsub.u.nxv8i16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uqsub.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
   %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uqsub.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -2105,9 +1986,8 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uqsub.nxv4i32(<vscale x 4 x i1>, <v
 define <vscale x 4 x i32> @replace_uqsub_intrinsic_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: define <vscale x 4 x i32> @replace_uqsub_intrinsic_i32
 ; CHECK-SAME: (<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uqsub.u.nxv4i32(<vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uqsub.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
   %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uqsub.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -2118,9 +1998,8 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.uqsub.nxv2i64(<vscale x 2 x i1>, <v
 define <vscale x 2 x i64> @replace_uqsub_intrinsic_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: define <vscale x 2 x i64> @replace_uqsub_intrinsic_i64
 ; CHECK-SAME: (<vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uqsub.u.nxv2i64(<vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uqsub.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[A]], <vscale x 2 x i64> [[B]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
   %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uqsub.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
index a22454b586c2..e00ad68bdfac 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
@@ -39,8 +39,7 @@ define i1 @ptest_any1(<vscale x 2 x i1> %a) #0 {
 ; No transform because the ptest is using differently sized operands.
 define i1 @ptest_any2(<vscale x 4 x i1> %a) #0 {
 ; CHECK-LABEL: @ptest_any2(
-; CHECK-NEXT:    [[MASK:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
 ; CHECK-NEXT:    [[OUT:%.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i1> [[TMP2]])
 ; CHECK-NEXT:    ret i1 [[OUT]]
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-rdffr-predication.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-rdffr-predication.ll
index 155102db52b5..ff728dc47c66 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-rdffr-predication.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-rdffr-predication.ll
@@ -6,8 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Test that rdffr is substituted with predicated form which enables ptest optimization later.
 define <vscale x 16 x i1> @predicate_rdffr() #0 {
 ; CHECK-LABEL: @predicate_rdffr(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> [[TMP1]])
+; CHECK-NEXT:    [[OUT:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret <vscale x 16 x i1> [[OUT]]
 ;
   %out = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()
diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll
index 3edb47dda62c..a09b66273368 100644
--- a/llvm/test/Transforms/InstCombine/array.ll
+++ b/llvm/test/Transforms/InstCombine/array.ll
@@ -109,6 +109,58 @@ entry:
   ret void
 }
 
+; FIXME: Should be transformed as OR+GEP -> GEP+GEP (similar to gep_inbounds_add_nuw below).
+define ptr @gep_inbounds_nuwaddlike(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_nuwaddlike(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = or disjoint i64 [[A]], [[B]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[PTR]], i64 [[ADD]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %add = or disjoint i64 %a, %b
+  %gep = getelementptr inbounds nuw i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+; FIXME: Preserve "inbounds nuw".
+define ptr @gep_inbounds_add_nuw(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nuw(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %add = add nuw i64 %a, %b
+  %gep = getelementptr inbounds nuw i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+; FIXME: Preserve "nusw nuw".
+define ptr @gep_inbounds_add_nusw_nuw(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nusw_nuw(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %add = add nuw i64 %a, %b
+  %gep = getelementptr nusw nuw i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+; FIXME: Preserve "nuw".
+define ptr @gep_add_nuw(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_add_nuw(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %add = add nuw i64 %a, %b
+  %gep = getelementptr nuw i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
 define ptr @gep_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
 ; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_nonneg(
 ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
@@ -219,6 +271,27 @@ define ptr @gep_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
   ret ptr %gep
 }
 
+; FIXME: Could be optimized similar to gep_inbounds_sext_add_nonneg above
+;        (difference is that we are using disjoint OR which is canonical form
+;        of ADD with disjoint operands).
+define ptr @gep_inbounds_sext_addlike_nonneg(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_addlike_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[ADD:%.*]] = or disjoint i32 [[A]], 10
+; CHECK-NEXT:    [[IDX:%.*]] = zext nneg i32 [[ADD]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[PTR]], i64 [[IDX]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i32 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %add = or disjoint i32 %a, 10
+  %idx = sext i32 %add to i64
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+  ret ptr %gep
+}
+
 define ptr @gep_inbounds_sext_add_not_nonneg_1(ptr %ptr, i32 %a) {
 ; CHECK-LABEL: define ptr @gep_inbounds_sext_add_not_nonneg_1(
 ; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fabs-as-int.ll
index 9d28cae8f04d..f0e83ca6302f 100644
--- a/llvm/test/Transforms/InstCombine/fabs-as-int.ll
+++ b/llvm/test/Transforms/InstCombine/fabs-as-int.ll
@@ -289,8 +289,8 @@ define i128 @fabs_as_int_ppc_fp128_f64_mask(ppc_fp128 %x) {
 define i128 @fabs_as_int_ppc_fp128_f128_mask(ppc_fp128 %x) {
 ; CHECK-LABEL: define i128 @fabs_as_int_ppc_fp128_f128_mask
 ; CHECK-SAME: (ppc_fp128 [[X:%.*]]) {
-; CHECK-NEXT:    [[BC:%.*]] = bitcast ppc_fp128 [[X]] to i128
-; CHECK-NEXT:    [[AND:%.*]] = and i128 [[BC]], 170141183460469231731687303715884105727
+; CHECK-NEXT:    [[TMP1:%.*]] = call ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 [[X]])
+; CHECK-NEXT:    [[AND:%.*]] = bitcast ppc_fp128 [[TMP1]] to i128
 ; CHECK-NEXT:    ret i128 [[AND]]
 ;
   %bc = bitcast ppc_fp128 %x to i128
diff --git a/llvm/test/Transforms/InstCombine/fneg-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-as-int.ll
index f8d88b4f238f..590aca687e5b 100644
--- a/llvm/test/Transforms/InstCombine/fneg-as-int.ll
+++ b/llvm/test/Transforms/InstCombine/fneg-as-int.ll
@@ -291,8 +291,8 @@ define i128 @fneg_as_int_ppc_fp128_f64_mask(ppc_fp128 %x) {
 define i128 @fneg_as_int_ppc_fp128_f128_mask(ppc_fp128 %x) {
 ; CHECK-LABEL: define i128 @fneg_as_int_ppc_fp128_f128_mask
 ; CHECK-SAME: (ppc_fp128 [[X:%.*]]) {
-; CHECK-NEXT:    [[BC:%.*]] = bitcast ppc_fp128 [[X]] to i128
-; CHECK-NEXT:    [[XOR:%.*]] = xor i128 [[BC]], -170141183460469231731687303715884105728
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg ppc_fp128 [[X]]
+; CHECK-NEXT:    [[XOR:%.*]] = bitcast ppc_fp128 [[TMP1]] to i128
 ; CHECK-NEXT:    ret i128 [[XOR]]
 ;
   %bc = bitcast ppc_fp128 %x to i128
diff --git a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll
index 8b245bdd7929..a0894c3febc9 100644
--- a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll
+++ b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll
@@ -317,8 +317,9 @@ define i128 @fneg_fabs_as_int_ppc_fp128_f64_mask(ppc_fp128 %x) {
 define i128 @fneg_fabs_as_int_ppc_fp128_f128_mask(ppc_fp128 %x) {
 ; CHECK-LABEL: define i128 @fneg_fabs_as_int_ppc_fp128_f128_mask
 ; CHECK-SAME: (ppc_fp128 [[X:%.*]]) {
-; CHECK-NEXT:    [[BC:%.*]] = bitcast ppc_fp128 [[X]] to i128
-; CHECK-NEXT:    [[OR:%.*]] = or i128 [[BC]], -170141183460469231731687303715884105728
+; CHECK-NEXT:    [[TMP1:%.*]] = call ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg ppc_fp128 [[TMP1]]
+; CHECK-NEXT:    [[OR:%.*]] = bitcast ppc_fp128 [[TMP2]] to i128
 ; CHECK-NEXT:    ret i128 [[OR]]
 ;
   %bc = bitcast ppc_fp128 %x to i128
diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll
index b76ae2017114..14da03dff6f4 100644
--- a/llvm/test/Transforms/InstCombine/sincospi.ll
+++ b/llvm/test/Transforms/InstCombine/sincospi.ll
@@ -90,18 +90,14 @@ define float @test_instbased_f32_other_user(ptr %ptr) {
 
 define float @test_constant_f32() {
 ; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f32(
-; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call <2 x float> @__sincospif_stret(float 1.000000e+00)
-; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i64 0
-; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i64 1
-; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
 ; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
 ; CHECK-FLOAT-IN-VEC-NEXT:    ret float [[RES]]
 ;
 ; CHECK-LABEL: @test_constant_f32(
-; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { float, float } @__sincospif_stret(float 1.000000e+00)
-; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 0
-; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 1
-; CHECK-NEXT:    [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[SINPI:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[COSPI:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
 ; CHECK-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
 ; CHECK-NEXT:    ret float [[RES]]
 ;
@@ -172,18 +168,14 @@ define double @test_instbased_f64() {
 
 define double @test_constant_f64() {
 ; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f64(
-; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
-; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
-; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
-; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
 ; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
 ; CHECK-FLOAT-IN-VEC-NEXT:    ret double [[RES]]
 ;
 ; CHECK-LABEL: @test_constant_f64(
-; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
-; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
-; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
-; CHECK-NEXT:    [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[SINPI:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[COSPI:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
 ; CHECK-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
 ; CHECK-NEXT:    ret double [[RES]]
 ;
diff --git a/llvm/test/Transforms/LICM/pr50367.ll b/llvm/test/Transforms/LICM/pr50367.ll
index a7cf21deff62..7fd176b6c6bb 100644
--- a/llvm/test/Transforms/LICM/pr50367.ll
+++ b/llvm/test/Transforms/LICM/pr50367.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -passes='loop-mssa(licm)' < %s | FileCheck %s
 @e = external dso_local global ptr, align 8
 
-define void @main(i1 %arg) {
+define void @main(i1 %arg, ptr %arg1) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP1:%.*]]
@@ -11,8 +11,47 @@ define void @main(i1 %arg) {
 ; CHECK:       loop2:
 ; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]]
 ; CHECK:       loop2.latch:
+; CHECK-NEXT:    store i32 0, ptr [[ARG1:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP2]]
 ; CHECK:       loop.latch:
+; CHECK-NEXT:    store ptr null, ptr @e, align 8, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[PTR:%.*]] = load ptr, ptr @e, align 8, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store i32 0, ptr [[PTR]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    br label [[LOOP1]]
+;
+entry:
+  br label %loop1
+
+loop1:
+  br label %loop2
+
+loop2:
+  br i1 %arg, label %loop2.latch, label %loop.latch
+
+loop2.latch:
+  store i32 0, ptr %arg1, align 4
+  br label %loop2
+
+loop.latch:
+  store ptr null, ptr @e, align 8, !tbaa !0
+  %ptr = load ptr, ptr @e, align 8, !tbaa !0
+  store i32 0, ptr %ptr, align 4, !tbaa !4
+  br label %loop1
+}
+
+define void @store_null(i1 %arg) {
+; CHECK-LABEL: @store_null(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK:       loop1:
+; CHECK-NEXT:    br label [[LOOP2:%.*]]
+; CHECK:       loop2:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[LOOP2_LATCH:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK:       loop2.latch:
+; CHECK-NEXT:    store i32 0, ptr null, align 4
+; CHECK-NEXT:    br label [[LOOP2]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    store i32 0, ptr null, align 4, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    br label [[LOOP1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LICM/pr59324.ll b/llvm/test/Transforms/LICM/pr59324.ll
index b0ad60e65069..ec33a0f8ded0 100644
--- a/llvm/test/Transforms/LICM/pr59324.ll
+++ b/llvm/test/Transforms/LICM/pr59324.ll
@@ -6,7 +6,10 @@ define void @test(ptr %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    store ptr null, ptr null, align 8
+; CHECK-NEXT:    [[P:%.*]] = load ptr, ptr null, align 8
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    store i32 [[V]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
@@ -19,3 +22,25 @@ loop:
   store i32 %v, ptr %a
   br label %loop
 }
+
+define void @test_inttoptr(ptr %a) {
+; CHECK-LABEL: @test_inttoptr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    store ptr null, ptr inttoptr (i64 128 to ptr), align 8
+; CHECK-NEXT:    [[P:%.*]] = load ptr, ptr inttoptr (i64 128 to ptr), align 8
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    store i32 [[V]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  store ptr null, ptr inttoptr (i64 128 to ptr)
+  %p = load ptr, ptr inttoptr (i64 128 to ptr)
+  %v = load i32, ptr %p
+  store i32 %v, ptr %a
+  br label %loop
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index f01aaa04606d..a28673cf8e55 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -429,7 +429,249 @@ exit:
   ret void
 }
 
+define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
+; RV64-LABEL: define void @vector_reverse_irregular_type(
+; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64:       [[VECTOR_PH]]:
+; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64:       [[VECTOR_BODY]]:
+; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
+; RV64-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV64-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV64-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV64-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV64-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV64-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV64-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV64-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV64-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
+; RV64-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV64-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV64-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV64-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV64-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV64-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV64-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV64-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV64-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV64-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV64-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV64-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV64-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV64-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64:       [[MIDDLE_BLOCK]]:
+; RV64-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; RV64:       [[SCALAR_PH]]:
+; RV64-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
+; RV64-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
+; RV64-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
+; RV64-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
+; RV64-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
+; RV64-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV64-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
+; RV64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV64:       [[EXIT]]:
+; RV64-NEXT:    ret void
+;
+; RV32-LABEL: define void @vector_reverse_irregular_type(
+; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV32-NEXT:  [[ENTRY:.*]]:
+; RV32-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV32:       [[VECTOR_PH]]:
+; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV32:       [[VECTOR_BODY]]:
+; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV32-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV32-NEXT:    [[DEC_IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV32-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV32-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV32-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV32-NEXT:    [[IV_NEXT:%.*]] = add nsw i64 [[DEC_IV]], -1
+; RV32-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP1]], -1
+; RV32-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP2]], -1
+; RV32-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP3]], -1
+; RV32-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP5]]
+; RV32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP6]]
+; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP7]]
+; RV32-NEXT:    [[TMP0:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV32-NEXT:    [[TMP13:%.*]] = load i7, ptr [[TMP9]], align 1
+; RV32-NEXT:    [[TMP14:%.*]] = load i7, ptr [[TMP10]], align 1
+; RV32-NEXT:    [[TMP15:%.*]] = load i7, ptr [[TMP11]], align 1
+; RV32-NEXT:    [[TMP16:%.*]] = insertelement <4 x i7> poison, i7 [[TMP0]], i32 0
+; RV32-NEXT:    [[TMP17:%.*]] = insertelement <4 x i7> [[TMP16]], i7 [[TMP13]], i32 1
+; RV32-NEXT:    [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2
+; RV32-NEXT:    [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3
+; RV32-NEXT:    [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1)
+; RV32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]]
+; RV32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]]
+; RV32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]]
+; RV32-NEXT:    [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0
+; RV32-NEXT:    store i7 [[TMP25]], ptr [[TMP21]], align 1
+; RV32-NEXT:    [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1
+; RV32-NEXT:    store i7 [[TMP26]], ptr [[TMP22]], align 1
+; RV32-NEXT:    [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2
+; RV32-NEXT:    store i7 [[TMP27]], ptr [[TMP23]], align 1
+; RV32-NEXT:    [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3
+; RV32-NEXT:    store i7 [[TMP28]], ptr [[TMP24]], align 1
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; RV32-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; RV32-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV32:       [[MIDDLE_BLOCK]]:
+; RV32-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; RV32:       [[SCALAR_PH]]:
+; RV32-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV32-NEXT:    br label %[[FOR_BODY:.*]]
+; RV32:       [[FOR_BODY]]:
+; RV32-NEXT:    [[DEC_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[FOR_BODY]] ]
+; RV32-NEXT:    [[IV_NEXT1]] = add nsw i64 [[DEC_IV1]], -1
+; RV32-NEXT:    [[ARRAYIDX_B1:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT1]]
+; RV32-NEXT:    [[TMP30:%.*]] = load i7, ptr [[ARRAYIDX_B1]], align 1
+; RV32-NEXT:    [[ADD:%.*]] = add i7 [[TMP30]], 1
+; RV32-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT1]]
+; RV32-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV32-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV1]], 1
+; RV32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV32:       [[EXIT]]:
+; RV32-NEXT:    ret void
+;
+; RV64-UF2-LABEL: define void @vector_reverse_irregular_type(
+; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; RV64-UF2-NEXT:  [[ENTRY:.*]]:
+; RV64-UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64-UF2:       [[VECTOR_PH]]:
+; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; RV64-UF2:       [[VECTOR_BODY]]:
+; RV64-UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-UF2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; RV64-UF2-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
+; RV64-UF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; RV64-UF2-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -2
+; RV64-UF2-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], -3
+; RV64-UF2-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], -4
+; RV64-UF2-NEXT:    [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], -5
+; RV64-UF2-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], -6
+; RV64-UF2-NEXT:    [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], -7
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP16]], -1
+; RV64-UF2-NEXT:    [[TMP2:%.*]] = add nsw i64 [[TMP0]], -1
+; RV64-UF2-NEXT:    [[TMP51:%.*]] = add nsw i64 [[TMP17]], -1
+; RV64-UF2-NEXT:    [[TMP11:%.*]] = add nsw i64 [[TMP24]], -1
+; RV64-UF2-NEXT:    [[TMP59:%.*]] = add nsw i64 [[TMP25]], -1
+; RV64-UF2-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP42]], -1
+; RV64-UF2-NEXT:    [[TMP14:%.*]] = add nsw i64 [[TMP43]], -1
+; RV64-UF2-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP50]], -1
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP1]]
+; RV64-UF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP2]]
+; RV64-UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP51]]
+; RV64-UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP59]]
+; RV64-UF2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP13]]
+; RV64-UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = load i7, ptr [[TMP3]], align 1
+; RV64-UF2-NEXT:    [[TMP6:%.*]] = load i7, ptr [[TMP4]], align 1
+; RV64-UF2-NEXT:    [[TMP26:%.*]] = load i7, ptr [[TMP18]], align 1
+; RV64-UF2-NEXT:    [[TMP27:%.*]] = load i7, ptr [[TMP19]], align 1
+; RV64-UF2-NEXT:    [[TMP28:%.*]] = insertelement <4 x i7> poison, i7 [[TMP5]], i32 0
+; RV64-UF2-NEXT:    [[TMP29:%.*]] = insertelement <4 x i7> [[TMP28]], i7 [[TMP6]], i32 1
+; RV64-UF2-NEXT:    [[TMP30:%.*]] = insertelement <4 x i7> [[TMP29]], i7 [[TMP26]], i32 2
+; RV64-UF2-NEXT:    [[TMP31:%.*]] = insertelement <4 x i7> [[TMP30]], i7 [[TMP27]], i32 3
+; RV64-UF2-NEXT:    [[TMP32:%.*]] = load i7, ptr [[TMP20]], align 1
+; RV64-UF2-NEXT:    [[TMP33:%.*]] = load i7, ptr [[TMP21]], align 1
+; RV64-UF2-NEXT:    [[TMP34:%.*]] = load i7, ptr [[TMP22]], align 1
+; RV64-UF2-NEXT:    [[TMP35:%.*]] = load i7, ptr [[TMP23]], align 1
+; RV64-UF2-NEXT:    [[TMP36:%.*]] = insertelement <4 x i7> poison, i7 [[TMP32]], i32 0
+; RV64-UF2-NEXT:    [[TMP37:%.*]] = insertelement <4 x i7> [[TMP36]], i7 [[TMP33]], i32 1
+; RV64-UF2-NEXT:    [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2
+; RV64-UF2-NEXT:    [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3
+; RV64-UF2-NEXT:    [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1)
+; RV64-UF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP1]]
+; RV64-UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP2]]
+; RV64-UF2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP51]]
+; RV64-UF2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP11]]
+; RV64-UF2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP59]]
+; RV64-UF2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]]
+; RV64-UF2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]]
+; RV64-UF2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]]
+; RV64-UF2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0
+; RV64-UF2-NEXT:    store i7 [[TMP7]], ptr [[TMP9]], align 1
+; RV64-UF2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1
+; RV64-UF2-NEXT:    store i7 [[TMP8]], ptr [[TMP10]], align 1
+; RV64-UF2-NEXT:    [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2
+; RV64-UF2-NEXT:    store i7 [[TMP52]], ptr [[TMP44]], align 1
+; RV64-UF2-NEXT:    [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3
+; RV64-UF2-NEXT:    store i7 [[TMP53]], ptr [[TMP45]], align 1
+; RV64-UF2-NEXT:    [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0
+; RV64-UF2-NEXT:    store i7 [[TMP54]], ptr [[TMP46]], align 1
+; RV64-UF2-NEXT:    [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1
+; RV64-UF2-NEXT:    store i7 [[TMP55]], ptr [[TMP47]], align 1
+; RV64-UF2-NEXT:    [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2
+; RV64-UF2-NEXT:    store i7 [[TMP56]], ptr [[TMP48]], align 1
+; RV64-UF2-NEXT:    [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3
+; RV64-UF2-NEXT:    store i7 [[TMP57]], ptr [[TMP49]], align 1
+; RV64-UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; RV64-UF2-NEXT:    [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
+; RV64-UF2-NEXT:    br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64-UF2:       [[MIDDLE_BLOCK]]:
+; RV64-UF2-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; RV64-UF2:       [[SCALAR_PH]]:
+; RV64-UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
+; RV64-UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64-UF2:       [[FOR_BODY]]:
+; RV64-UF2-NEXT:    [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[DEC_IV]], -1
+; RV64-UF2-NEXT:    [[ARRAYIDX_B:%.*]] = getelementptr inbounds i7, ptr [[B]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    [[TMP12:%.*]] = load i7, ptr [[ARRAYIDX_B]], align 1
+; RV64-UF2-NEXT:    [[ADD:%.*]] = add i7 [[TMP12]], 1
+; RV64-UF2-NEXT:    [[ARRAYIDX_A:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[IV_NEXT]]
+; RV64-UF2-NEXT:    store i7 [[ADD]], ptr [[ARRAYIDX_A]], align 1
+; RV64-UF2-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[DEC_IV]], 1
+; RV64-UF2-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; RV64-UF2:       [[EXIT]]:
+; RV64-UF2-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %dec.iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+  %iv.next = add nsw i64 %dec.iv, -1
+  %arrayidx.b = getelementptr inbounds i7, ptr %B, i64 %iv.next
+  %0 = load i7, ptr %arrayidx.b, align 1
+  %add = add i7 %0, 1
+  %arrayidx.a = getelementptr inbounds i7, ptr %A, i64 %iv.next
+  store i7 %add, ptr %arrayidx.a, align 1
+  %cmp = icmp ugt i64 %dec.iv, 1
+  br i1 %cmp, label %for.body, label %exit, !llvm.loop !4
+
+exit:
+  ret void
+}
+
 !0 = distinct !{!0, !1, !2, !3}
 !1 = !{!"llvm.loop.vectorize.width", i32 4}
 !2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = distinct !{!4, !1, !3}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index bc0ccfb45c05..02a876a3fda6 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -7,112 +7,86 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i40> poison, i40 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i40> [[BROADCAST_SPLATINSERT1]], <16 x i40> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <16 x i40> [[BROADCAST_SPLAT2]], splat (i40 24)
-; CHECK-NEXT:    [[TMP2:%.*]] = ashr <16 x i40> [[TMP1]], splat (i40 28)
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i40> [[TMP2]] to <16 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <16 x i1> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], splat (i1 true)
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    store i1 [[TMP10]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
 ; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
-; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; CHECK:       pred.store.continue2:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
-; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
-; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
-; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
-; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
-; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
-; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
-; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
-; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
-; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
-; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
-; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
-; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
-; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
 ; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
-; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
 ; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 83e2f84814ad..7d9ed7d6215c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -58,3 +58,212 @@ bb2:
 bb3:
   ret void
 }
+
+; Test case for https://github.com/llvm/llvm-project/issues/131359.
+define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
+; CHECK-LABEL: @redundant_or_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 true, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP7]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP8]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP10]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP13]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP17]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
+; CHECK:       then.1:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV]], 2
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP]], true
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR]], i1 [[C_0]], i1 false
+; CHECK-NEXT:    br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
+; CHECK:       then.2:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 4
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  br i1 %c.0, label %loop.latch, label %then.1
+
+then.1:
+  %cmp = icmp eq i32 %iv, 2
+  %or = or i1 %cmp, true
+  %cond = select i1 %or, i1 %c.1, i1 false
+  br i1 %cond, label %then.2, label %loop.latch
+
+then.2:
+  %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+  store i32 0, ptr %gep, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 3
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
+; CHECK-LABEL: @redundant_or_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C_0:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 true, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 2)
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP9]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP12]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP13]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP16]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
+; CHECK:       then.1:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV]], 2
+; CHECK-NEXT:    [[OR:%.*]] = or i1 true, [[CMP]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR]], i1 [[C_1]], i1 false
+; CHECK-NEXT:    br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
+; CHECK:       then.2:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 4
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  br i1 %c.0, label %loop.latch, label %then.1
+
+then.1:
+  %cmp = icmp eq i32 %iv, 2
+  %or = or i1 true, %cmp
+  %cond = select i1 %or, i1 %c.1, i1 false
+  br i1 %cond, label %then.2, label %loop.latch
+
+then.2:
+  %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+  store i32 0, ptr %gep, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 3
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/ObjCARC/contract.ll b/llvm/test/Transforms/ObjCARC/contract.ll
index 70bd57a0c719..24f9a712ccd0 100644
--- a/llvm/test/Transforms/ObjCARC/contract.ll
+++ b/llvm/test/Transforms/ObjCARC/contract.ll
@@ -234,6 +234,22 @@ define void @test14(ptr %a, ptr %b) {
   ret void
 }
 
+define void @test15(ptr %x) {
+; CHECK-LABEL: define void @test15(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds ptr, ptr [[X]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = call ptr @llvm.objc.retain(ptr [[Y]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    call void @use_pointer(ptr [[V0]])
+; CHECK-NEXT:    call void @use_pointer(ptr [[V0]])
+; CHECK-NEXT:    ret void
+;
+  %y = getelementptr inbounds ptr, ptr %x, i32 0
+  %v0 = call ptr @llvm.objc.retain(ptr %y) nounwind
+  call void @use_pointer(ptr %x)
+  call void @use_pointer(ptr %y)
+  ret void
+}
+
 declare void @llvm.objc.clang.arc.use(...) nounwind
 declare void @llvm.objc.clang.arc.noop.use(...) nounwind
 
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index ffbacc1a8903..95bf296af9b0 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -1028,15 +1028,14 @@ define i32 @nodefaultwithholes(i32 %c) {
 ; CHECK-LABEL: @nodefaultwithholes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[C:%.*]], 6
-; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_HOLE_CHECK:%.*]], label [[SW_DEFAULT:%.*]]
-; CHECK:       sw.default:
-; CHECK-NEXT:    call void @exit(i32 1)
-; CHECK-NEXT:    unreachable
-; CHECK:       switch.hole_check:
 ; CHECK-NEXT:    [[SWITCH_MASKINDEX:%.*]] = trunc i32 [[C]] to i8
 ; CHECK-NEXT:    [[SWITCH_SHIFTED:%.*]] = lshr i8 47, [[SWITCH_MASKINDEX]]
 ; CHECK-NEXT:    [[SWITCH_LOBIT:%.*]] = trunc i8 [[SWITCH_SHIFTED]] to i1
-; CHECK-NEXT:    br i1 [[SWITCH_LOBIT]], label [[SWITCH_LOOKUP:%.*]], label [[SW_DEFAULT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_LOBIT]], i1 false
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[SWITCH_LOOKUP:%.*]], label [[SW_DEFAULT:%.*]]
+; CHECK:       sw.default:
+; CHECK-NEXT:    call void @exit(i32 1)
+; CHECK-NEXT:    unreachable
 ; CHECK:       switch.lookup:
 ; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [6 x i32], ptr @switch.table.nodefaultwithholes, i32 0, i32 [[C]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
index 4ebf09ae3b12..fd6b21a7f9e6 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
@@ -143,17 +143,16 @@ define i32 @reachable_default_holes_0to31(i32 %x, i32 %y) {
 ; CHECK-LABEL: @reachable_default_holes_0to31(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 32
-; CHECK-NEXT:    br i1 [[TMP0]], label [[SWITCH_HOLE_CHECK:%.*]], label [[RETURN:%.*]]
-; CHECK:       switch.hole_check:
 ; CHECK-NEXT:    [[SWITCH_SHIFTED:%.*]] = lshr i32 -277094665, [[X]]
 ; CHECK-NEXT:    [[SWITCH_LOBIT:%.*]] = trunc i32 [[SWITCH_SHIFTED]] to i1
-; CHECK-NEXT:    br i1 [[SWITCH_LOBIT]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_LOBIT]], i1 false
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[SWITCH_LOOKUP:%.*]], label [[RETURN:%.*]]
 ; CHECK:       switch.lookup:
 ; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [32 x i32], ptr @switch.table.reachable_default_holes_0to31, i32 0, i32 [[X]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[SWITCH_LOAD]], [[SWITCH_LOOKUP]] ], [ [[Y:%.*]], [[SWITCH_HOLE_CHECK]] ], [ [[Y]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[SWITCH_LOAD]], [[SWITCH_LOOKUP]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
index 37f48a9a7e03..7b88ec338cf5 100644
--- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
+++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
@@ -88,10 +88,9 @@ define void @one_pred_trunc_cond(i8 %v0, i8 %v1) {
 ; CHECK-LABEL: @one_pred_trunc_cond(
 ; CHECK-NEXT:  pred:
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; CHECK-NEXT:    br i1 [[C0]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[C1:%.*]] = trunc i8 [[V1:%.*]] to i1
-; CHECK-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       common.ret:
 ; CHECK-NEXT:    ret void
 ; CHECK:       final_left:
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 09356191345f..7a5fd83cd958 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -8347,4 +8347,13 @@ TEST(APFloatTest, AddOrSubtractSignificand) {
   Helper::runTest(true, false, 3, 0x10001, false, 7, 0x100, false, 6, 0x1e00,
                   lfLessThanHalf);
 }
+
+TEST(APFloatTest, hasSignBitInMSB) {
+  EXPECT_TRUE(APFloat::hasSignBitInMSB(APFloat::IEEEsingle()));
+  EXPECT_TRUE(APFloat::hasSignBitInMSB(APFloat::x87DoubleExtended()));
+  EXPECT_TRUE(APFloat::hasSignBitInMSB(APFloat::PPCDoubleDouble()));
+  EXPECT_TRUE(APFloat::hasSignBitInMSB(APFloat::IEEEquad()));
+  EXPECT_FALSE(APFloat::hasSignBitInMSB(APFloat::Float8E8M0FNU()));
+}
+
 } // namespace
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 24822c847046..c954163cdeb3 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringMatcher.h"
diff --git a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h
index 35a9abdc37c8..8da6fbef0672 100644
--- a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h
+++ b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h
@@ -15,15 +15,14 @@
 #define LLVM_UTILS_TABLEGEN_BASIC_SEQUENCETOOFFSETTABLE_H
 
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Main.h"
 #include <algorithm>
 #include <cassert>
 #include <functional>
 #include <map>
 
 namespace llvm {
-extern cl::opt<bool> EmitLongStrLiterals;
 
 inline void printChar(raw_ostream &OS, char C) {
   unsigned char UC(C);
diff --git a/llvm/utils/TableGen/Basic/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp
index 80ac93f2b54f..edb779150069 100644
--- a/llvm/utils/TableGen/Basic/TableGen.cpp
+++ b/llvm/utils/TableGen/Basic/TableGen.cpp
@@ -26,15 +26,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-cl::opt<bool> EmitLongStrLiterals(
-    "long-string-literals",
-    cl::desc("when emitting large string tables, prefer string literals over "
-             "comma-separated char literals. This can be a readability and "
-             "compile-time performance win, but upsets some compilers"),
-    cl::Hidden, cl::init(true));
-} // end namespace llvm
-
 static cl::OptionCategory PrintEnumsCat("Options for -print-enums");
 static cl::opt<std::string> Class("class",
                                   cl::desc("Print Enum list for this class"),
diff --git a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
index 63ee0deb8711..64f03dae83e7 100644
--- a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
+++ b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
@@ -9,6 +9,7 @@
 #include "Basic/SequenceToOffsetTable.h"
 #include "Common/CodeGenDAGPatterns.h" // For SDNodeInfo.
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
index 9d42409f1973..d40ce6424fe8 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/test/BUILD.gn
@@ -35,6 +35,7 @@ write_lit_config("lit_site_cfg") {
         rebase_path("//clang-tools-extra/clangd/test"),
 
     "CURRENT_TOOLS_DIR=",
+    "CLANGD_BUILD_DEXP=1",
     "CLANGD_ENABLE_REMOTE=0",
     "CLANGD_TIDY_CHECKS=1",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
diff --git a/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
index d90df7bc0e57..b40fdf154b01 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TableGen/BUILD.gn
@@ -10,6 +10,7 @@ static_library("TableGen") {
     "Record.cpp",
     "SetTheory.cpp",
     "StringMatcher.cpp",
+    "StringToOffsetTable.cpp",
     "TGLexer.cpp",
     "TGParser.cpp",
     "TGTimer.cpp",
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index fafda816a388..88d58e0a1efb 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -1756,6 +1756,23 @@ that it has a value within the valid range of the enum. If their
 wrapper attribute instead of using a bare signless integer attribute
 for storage.
 
+### Enum properties
+
+Enums can be wrapped in properties so that they can be stored inline.
+This causes a value of the enum's C++ class to become a member of the operation's
+property struct and for the operation's verifier to check that the enum's value
+is a valid value for the enum.
+
+The basic wrapper is `EnumProp`, which simply takes an `EnumInfo`.
+
+A less ambiguous syntax, namely putting a mnemonic and `<>`s surrounding
+the enum is generated with `NamedEnumProp`, which takes a `*EnumInfo`
+and a mnemonic string, which becomes part of the property's syntax.
+
+Both of these `EnumProp` types have a `*EnumPropWithAttrForm`, which allows for
+transparently upgrading from `EnumAttr`s and optionally retaining those
+attributes in the generic form.
+
 ## Debugging Tips
 
 ### Run `mlir-tblgen` to see the generated content
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
index a9de78780645..34a30a00790e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td
@@ -485,17 +485,16 @@ def DISubprogramFlags : I32BitEnumAttr<
 // IntegerOverflowFlags
 //===----------------------------------------------------------------------===//
 
-def IOFnone : I32BitEnumAttrCaseNone<"none">;
-def IOFnsw  : I32BitEnumAttrCaseBit<"nsw", 0>;
-def IOFnuw  : I32BitEnumAttrCaseBit<"nuw", 1>;
+def IOFnone : I32BitEnumCaseNone<"none">;
+def IOFnsw  : I32BitEnumCaseBit<"nsw", 0>;
+def IOFnuw  : I32BitEnumCaseBit<"nuw", 1>;
 
-def IntegerOverflowFlags : I32BitEnumAttr<
+def IntegerOverflowFlags : I32BitEnum<
     "IntegerOverflowFlags",
     "LLVM integer overflow flags",
     [IOFnone, IOFnsw, IOFnuw]> {
   let separator = ", ";
   let cppNamespace = "::mlir::LLVM";
-  let genSpecializedAttr = 0;
   let printBitEnumPrimaryGroups = 1;
 }
 
@@ -504,6 +503,11 @@ def LLVM_IntegerOverflowFlagsAttr :
   let assemblyFormat = "`<` $value `>`";
 }
 
+def LLVM_IntegerOverflowFlagsProp :
+    NamedEnumPropWithAttrForm<IntegerOverflowFlags, "overflow", LLVM_IntegerOverflowFlagsAttr> {
+  let defaultValue = enum.cppType # "::" # "none";
+}
+
 //===----------------------------------------------------------------------===//
 // FastmathFlags
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 7ffa880dc8da..e89e78aec714 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -60,7 +60,7 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
                                    list<Trait> traits = []> :
     LLVM_ArithmeticOpBase<AnySignlessInteger, mnemonic, instName,
     !listconcat([DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)> {
-  dag iofArg = (ins EnumProp<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
+  dag iofArg = (ins LLVM_IntegerOverflowFlagsProp:$overflowFlags);
   let arguments = !con(commonArgs, iofArg);
 
   string mlirBuilder = [{
@@ -69,7 +69,7 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
     $res = op;
   }];
   let assemblyFormat = [{
-    $lhs `,` $rhs `` custom<OverflowFlags>($overflowFlags) attr-dict `:` type($res)
+    $lhs `,` $rhs ($overflowFlags^)? attr-dict `:` type($res)
   }];
   string llvmBuilder =
     "$res = builder.Create" # instName #
@@ -563,10 +563,10 @@ class LLVM_CastOpWithOverflowFlag<string mnemonic, string instName, Type type,
                   Type resultType, list<Trait> traits = []> :
     LLVM_Op<mnemonic, !listconcat([Pure], [DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)>,
     LLVM_Builder<"$res = builder.Create" # instName # "($arg, $_resultType, /*Name=*/\"\", op.hasNoUnsignedWrap(), op.hasNoSignedWrap());"> {
-  let arguments = (ins type:$arg, EnumProp<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
+  let arguments = (ins type:$arg, LLVM_IntegerOverflowFlagsProp:$overflowFlags);
   let results = (outs resultType:$res);
   let builders = [LLVM_OneResultOpBuilder];
-  let assemblyFormat = "$arg `` custom<OverflowFlags>($overflowFlags) attr-dict `:` type($arg) `to` type($res)";
+  let assemblyFormat = "$arg ($overflowFlags^)? attr-dict `:` type($arg) `to` type($res)";
   string llvmInstName = instName;
   string mlirBuilder = [{
     auto op = $_builder.create<$_qualCppClassName>(
diff --git a/mlir/include/mlir/IR/EnumAttr.td b/mlir/include/mlir/IR/EnumAttr.td
index 931126a155fb..3f7f747ac20d 100644
--- a/mlir/include/mlir/IR/EnumAttr.td
+++ b/mlir/include/mlir/IR/EnumAttr.td
@@ -10,6 +10,7 @@
 #define ENUMATTR_TD
 
 include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/Properties.td"
 
 //===----------------------------------------------------------------------===//
 // Enum attribute kinds
@@ -552,6 +553,141 @@ class EnumAttr<Dialect dialect, EnumInfo enumInfo, string name = "",
   let assemblyFormat = "$value";
 }
 
+// A property wrapping by a C++ enum. This class will automatically create bytecode
+// serialization logic for the given enum, as well as arranging for parser and
+// printer calls.
+class EnumProp<EnumInfo enumInfo> : Property<enumInfo.cppType, enumInfo.summary> {
+  EnumInfo enum = enumInfo;
+
+  let description = enum.description;
+  let predicate = !if(
+    !isa<BitEnumBase>(enum),
+    CPred<"(static_cast<" # enum.underlyingType # ">($_self) & ~" # !cast<BitEnumBase>(enum).validBits # ") == 0">,
+    Or<!foreach(case, enum.enumerants, CPred<"$_self == " # enum.cppType # "::" # case.symbol>)>);
+
+  let convertFromAttribute = [{
+    auto intAttr = ::mlir::dyn_cast_if_present<::mlir::IntegerAttr>($_attr);
+    if (!intAttr) {
+      return $_diag() << "expected IntegerAttr storage for }] #
+        enum.cppType # [{";
+    }
+    $_storage = static_cast<}] # enum.cppType # [{>(intAttr.getValue().getZExtValue());
+    return ::mlir::success();
+  }];
+
+  let convertToAttribute = [{
+    return ::mlir::IntegerAttr::get(::mlir::IntegerType::get($_ctxt, }] # enum.bitwidth
+      # [{), static_cast<}] # enum.underlyingType #[{>($_storage));
+  }];
+
+  let writeToMlirBytecode = [{
+    $_writer.writeVarInt(static_cast<uint64_t>($_storage));
+  }];
+
+  let readFromMlirBytecode = [{
+    uint64_t rawValue;
+    if (::mlir::failed($_reader.readVarInt(rawValue)))
+      return ::mlir::failure();
+    if (rawValue > std::numeric_limits<}] # enum.underlyingType # [{>::max())
+      return ::mlir::failure();
+    $_storage = static_cast<}] # enum.cppType # [{>(rawValue);
+  }];
+
+  let optionalParser = [{
+    auto value = ::mlir::FieldParser<std::optional<}] # enum.cppType # [{>>::parse($_parser);
+    if (::mlir::failed(value))
+      return ::mlir::failure();
+    if (!(value->has_value()))
+      return std::nullopt;
+    $_storage = std::move(**value);
+  }];
+}
+
+// Enum property that can have been (or, if `storeInCustomAttribute` is true, will also
+// be stored as) an attribute, in addition to being stored as an integer attribute.
+class EnumPropWithAttrForm<EnumInfo enumInfo, Attr attributeForm>
+    : EnumProp<enumInfo> {
+  Attr attrForm = attributeForm;
+  bit storeInCustomAttribute = 0;
+
+  let convertFromAttribute = [{
+    auto customAttr = ::mlir::dyn_cast_if_present<}]
+    # attrForm.storageType # [{>($_attr);
+    if (customAttr) {
+      $_storage = customAttr.getValue();
+      return ::mlir::success();
+    }
+    auto intAttr = ::mlir::dyn_cast_if_present<::mlir::IntegerAttr>($_attr);
+    if (!intAttr) {
+      return $_diag() << "expected }] # attrForm.storageType
+      # [{ or IntegerAttr storage for }] # enum.cppType # [{";
+    }
+    $_storage = static_cast<}] # enum.cppType # [{>(intAttr.getValue().getZExtValue());
+    return ::mlir::success();
+  }];
+
+  let convertToAttribute = !if(storeInCustomAttribute, [{
+    return }] # attrForm.storageType # [{::get($_ctxt, $_storage);
+  }], [{
+    return ::mlir::IntegerAttr::get(::mlir::IntegerType::get($_ctxt, }] # enumInfo.bitwidth
+      # [{), static_cast<}] # enum.underlyingType #[{>($_storage));
+  }]);
+}
+
+class _namedEnumPropFields<string cppType, string mnemonic> {
+  code parser = [{
+    if ($_parser.parseKeyword("}] # mnemonic # [{")
+        || $_parser.parseLess()) {
+      return ::mlir::failure();
+    }
+    auto parseRes = ::mlir::FieldParser<}] # cppType # [{>::parse($_parser);
+    if (::mlir::failed(parseRes) ||
+        ::mlir::failed($_parser.parseGreater())) {
+      return ::mlir::failure();
+    }
+    $_storage = *parseRes;
+  }];
+
+  code optionalParser = [{
+    if ($_parser.parseOptionalKeyword("}] # mnemonic # [{")) {
+      return std::nullopt;
+    }
+    if ($_parser.parseLess()) {
+      return ::mlir::failure();
+    }
+    auto parseRes = ::mlir::FieldParser<}] # cppType # [{>::parse($_parser);
+    if (::mlir::failed(parseRes) ||
+        ::mlir::failed($_parser.parseGreater())) {
+      return ::mlir::failure();
+    }
+    $_storage = *parseRes;
+  }];
+
+  code printer = [{
+    $_printer << "}] # mnemonic # [{<" << $_storage << ">";
+  }];
+}
+
+// An EnumProp which, when printed, is surrounded by mnemonic<>.
+// For example, if the enum can be a, b, or c, and the mnemonic is foo,
+// the format of this property will be "foo<a>", "foo<b>", or "foo<c>".
+class NamedEnumProp<EnumInfo enumInfo, string name>
+    : EnumProp<enumInfo> {
+  string mnemonic = name;
+  let parser = _namedEnumPropFields<enum.cppType, mnemonic>.parser;
+  let optionalParser = _namedEnumPropFields<enum.cppType, mnemonic>.optionalParser;
+  let printer = _namedEnumPropFields<enum.cppType, mnemonic>.printer;
+}
+
+// A `NamedEnumProp` with an attribute form as in `EnumPropWithAttrForm`.
+class NamedEnumPropWithAttrForm<EnumInfo enumInfo, string name, Attr attributeForm>
+    : EnumPropWithAttrForm<enumInfo, attributeForm> {
+  string mnemonic = name;
+  let parser = _namedEnumPropFields<enum.cppType, mnemonic>.parser;
+  let optionalParser = _namedEnumPropFields<enum.cppType, mnemonic>.optionalParser;
+  let printer = _namedEnumPropFields<enumInfo.cppType, mnemonic>.printer;
+}
+
 class _symbolToValue<EnumInfo enumInfo, string case> {
   defvar cases =
     !filter(iter, enumInfo.enumerants, !eq(iter.str, case));
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 8bd834379040..739df03c7ef2 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -239,25 +239,6 @@ def I64Prop : IntProp<"int64_t">;
 def I32Property : IntProp<"int32_t">, Deprecated<"moved to shorter name I32Prop">;
 def I64Property : IntProp<"int64_t">, Deprecated<"moved to shorter name I64Prop">;
 
-class EnumProp<string storageTypeParam, string desc = "", string default = ""> :
-    Property<storageTypeParam, desc> {
-  // TODO:  implement predicate for enum validity.
-  let writeToMlirBytecode = [{
-    $_writer.writeVarInt(static_cast<uint64_t>($_storage));
-  }];
-  let readFromMlirBytecode = [{
-    uint64_t val;
-    if (failed($_reader.readVarInt(val)))
-      return ::mlir::failure();
-    $_storage = static_cast<}] # storageTypeParam # [{>(val);
-  }];
-  let defaultValue = default;
-}
-
-class EnumProperty<string storageTypeParam, string desc = "", string default = ""> :
-  EnumProp<storageTypeParam, desc, default>,
-  Deprecated<"moved to shorter name EnumProp">;
-
 // Note: only a class so we can deprecate the old name
 class _cls_StringProp : Property<"std::string", "string"> {
   let interfaceType = "::llvm::StringRef";
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 4891dab3aa1d..c6c695b442b4 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -136,9 +136,13 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
+    Value initShflValue = adaptor.getValue();
+    Type shflType = initShflValue.getType();
     // TODO: Add support for non 32-bit shuffle values.
-    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
-      return failure();
+    if (!shflType.isIntOrFloat() || shflType.getIntOrFloatBitWidth() != 32)
+      return rewriter.notifyMatchFailure(
+          op, "only 32-bit int/float types are supported");
+
     const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
     Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
 
@@ -175,16 +179,14 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
     Value dwordAlignedDstLane =
         rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
-    Value initShflValue = adaptor.getValue();
-    if (adaptor.getValue().getType().isF32()) {
+    if (shflType.isF32()) {
       initShflValue =
           rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
     }
     Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
         loc, int32Type, dwordAlignedDstLane, initShflValue);
-    if (adaptor.getValue().getType().isF32()) {
-      shflValue = rewriter.create<LLVM::BitcastOp>(
-          loc, adaptor.getValue().getType(), shflValue);
+    if (shflType.isF32()) {
+      shflValue = rewriter.create<LLVM::BitcastOp>(loc, shflType, shflValue);
     }
     rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
     return success();
diff --git a/mlir/lib/Dialect/GPU/Transforms/ShuffleRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/ShuffleRewriter.cpp
index 4bd4da25f6e5..9f2900214e8b 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ShuffleRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ShuffleRewriter.cpp
@@ -40,8 +40,9 @@ struct GpuShuffleRewriter : public OpRewritePattern<gpu::ShuffleOp> {
     auto i64 = rewriter.getI64Type();
 
     // If the type of the value is either i32 or f32, the op is already valid.
-    if (valueType.getIntOrFloatBitWidth() == 32)
-      return failure();
+    if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 64)
+      return rewriter.notifyMatchFailure(
+          op, "only 64-bit int/float types are supported");
 
     Value lo, hi;
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 28d2019b225c..d1d00ca9681e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -50,71 +50,6 @@ using mlir::LLVM::tailcallkind::getMaxEnumValForTailCallKind;
 #include "mlir/Dialect/LLVMIR/LLVMOpsDialect.cpp.inc"
 
 //===----------------------------------------------------------------------===//
-// Property Helpers
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// IntegerOverflowFlags
-//===----------------------------------------------------------------------===//
-
-namespace mlir {
-static Attribute convertToAttribute(MLIRContext *ctx,
-                                    IntegerOverflowFlags flags) {
-  return IntegerOverflowFlagsAttr::get(ctx, flags);
-}
-
-static LogicalResult
-convertFromAttribute(IntegerOverflowFlags &flags, Attribute attr,
-                     function_ref<InFlightDiagnostic()> emitError) {
-  auto flagsAttr = dyn_cast<IntegerOverflowFlagsAttr>(attr);
-  if (!flagsAttr) {
-    return emitError() << "expected 'overflowFlags' attribute to be an "
-                          "IntegerOverflowFlagsAttr, but got "
-                       << attr;
-  }
-  flags = flagsAttr.getValue();
-  return success();
-}
-} // namespace mlir
-
-static ParseResult parseOverflowFlags(AsmParser &p,
-                                      IntegerOverflowFlags &flags) {
-  if (failed(p.parseOptionalKeyword("overflow"))) {
-    flags = IntegerOverflowFlags::none;
-    return success();
-  }
-  if (p.parseLess())
-    return failure();
-  do {
-    StringRef kw;
-    SMLoc loc = p.getCurrentLocation();
-    if (p.parseKeyword(&kw))
-      return failure();
-    std::optional<IntegerOverflowFlags> flag =
-        symbolizeIntegerOverflowFlags(kw);
-    if (!flag)
-      return p.emitError(loc,
-                         "invalid overflow flag: expected nsw, nuw, or none");
-    flags = flags | *flag;
-  } while (succeeded(p.parseOptionalComma()));
-  return p.parseGreater();
-}
-
-static void printOverflowFlags(AsmPrinter &p, Operation *op,
-                               IntegerOverflowFlags flags) {
-  if (flags == IntegerOverflowFlags::none)
-    return;
-  p << " overflow<";
-  SmallVector<StringRef, 2> strs;
-  if (bitEnumContainsAny(flags, IntegerOverflowFlags::nsw))
-    strs.push_back("nsw");
-  if (bitEnumContainsAny(flags, IntegerOverflowFlags::nuw))
-    strs.push_back("nuw");
-  llvm::interleaveComma(strs, p);
-  p << ">";
-}
-
-//===----------------------------------------------------------------------===//
 // Attribute Helpers
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index c4ef7d0bb9ff..47368532df16 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -731,9 +731,56 @@ struct ConcatSliceOptimization : public OpRewritePattern<tosa::SliceOp> {
   }
 };
 
+// Update size operand of tosa.slice if size has dynamic dims but corresponding
+// output dim is static
+struct SliceDynamicSizeCanonicalization
+    : public OpRewritePattern<tosa::SliceOp> {
+  using OpRewritePattern<tosa::SliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tosa::SliceOp sliceOp,
+                                PatternRewriter &rewriter) const override {
+    ShapedType resultType = cast<ShapedType>(sliceOp.getType());
+
+    ElementsAttr sizeElems;
+    if (!matchPattern(sliceOp.getSize(), m_Constant(&sizeElems))) {
+      return rewriter.notifyMatchFailure(
+          sliceOp, "size of slice must be a static ranked shape");
+    }
+
+    llvm::SmallVector<int64_t> sliceSizes =
+        llvm::to_vector(sizeElems.getValues<int64_t>());
+
+    bool replaceSliceSize{false};
+    // if size op has -1 indicating dynamic shape but corresponding dim on the
+    // output is statically known, update size to match with known output dim
+    // shape
+    for (const auto &[index, size] : llvm::enumerate(sliceSizes)) {
+      if (size == -1 && !resultType.isDynamicDim(index)) {
+        sliceSizes[index] = resultType.getDimSize(index);
+        replaceSliceSize = true;
+      }
+    }
+
+    if (!replaceSliceSize) {
+      return rewriter.notifyMatchFailure(
+          sliceOp, "no dimension of size of slice is dynamic that resolves "
+                   "to static output shape");
+    }
+
+    auto size_op = getTosaConstShape(rewriter, sliceOp.getLoc(), sliceSizes);
+    auto newSliceOp = rewriter.create<tosa::SliceOp>(
+        sliceOp.getLoc(), sliceOp.getType(), sliceOp.getInput1(),
+        sliceOp.getStart(), size_op);
+
+    rewriter.replaceOp(sliceOp, newSliceOp.getResult());
+    return success();
+  }
+};
+
 void SliceOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                           MLIRContext *context) {
-  results.add<ConcatSliceOptimization>(context);
+  results.add<ConcatSliceOptimization, SliceDynamicSizeCanonicalization>(
+      context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-unsupported.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-unsupported.mlir
new file mode 100644
index 000000000000..90f2e5f047cd
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-unsupported.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -verify-diagnostics
+
+gpu.module @test_module {
+  // ROCDL lowering only suport shuffles for 32bit ints/floats, but they
+  // shouldn't crash on unsupported types.
+  func.func @gpu_shuffle_unsupported(%arg0 : vector<4xf16>) -> vector<4xf16> {
+    %offset = arith.constant 4 : i32
+    %width = arith.constant 64 : i32
+    // expected-error @+1 {{failed to legalize operation 'gpu.shuffle'}}
+    %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : vector<4xf16>
+    return %shfl : vector<4xf16>
+  }
+}
diff --git a/mlir/test/Dialect/GPU/shuffle-rewrite.mlir b/mlir/test/Dialect/GPU/shuffle-rewrite.mlir
index 461825820153..c0ccae05a057 100644
--- a/mlir/test/Dialect/GPU/shuffle-rewrite.mlir
+++ b/mlir/test/Dialect/GPU/shuffle-rewrite.mlir
@@ -49,3 +49,14 @@ module {
     return
   }
 }
+
+// -----
+
+// CHECK-LABEL: @gpu_shuffle_unsupported
+func.func @gpu_shuffle_unsupported(%arg0 : vector<4xf16>) -> vector<4xf16> {
+  %offset = arith.constant 4 : i32
+  %width = arith.constant 64 : i32
+  // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : vector<4xf16>
+  %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : vector<4xf16>
+  return %shfl : vector<4xf16>
+}
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index b366b4f1e4fd..d153474593d8 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -1212,3 +1212,18 @@ func.func @do_not_fold_intdiv_division_by_0() -> tensor<1x24x2xi32> {
   %16 = tosa.intdiv %4, %1 : (tensor<1x24x2xi32>, tensor<1x24x2xi32>) -> tensor<1x24x2xi32>
   return %16 : tensor<1x24x2xi32>
 }
+
+
+// -----
+// CHECK-LABEL:   func.func @slice_dynamic_size_static_output_canonicalize(
+// CHECK-SAME:                     %[[ARG0:.*]]: tensor<2x60x59x?xf32>) -> tensor<2x60x58x?xf32> {
+// CHECK:           %[[START:.*]] = tosa.const_shape  {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4>
+// CHECK:           %[[SIZE:.*]] = tosa.const_shape  {values = dense<[2, 60, 58, -1]> : tensor<4xindex>} : () -> !tosa.shape<4>
+// CHECK:           %[[SLICE:.*]] = tosa.slice %[[ARG0]], %[[START]], %[[SIZE]] : (tensor<2x60x59x?xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<2x60x58x?xf32>
+// CHECK:           return %[[SLICE]]
+func.func @slice_dynamic_size_static_output_canonicalize(%arg0: tensor<2x60x59x?xf32>) -> tensor<2x60x58x?xf32> {
+    %0 = tosa.const_shape  {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4>
+    %1 = tosa.const_shape  {values = dense<[-1, 60, 58, -1]> : tensor<4xindex>} : () -> !tosa.shape<4>
+    %2 = tosa.slice %arg0, %0, %1 : (tensor<2x60x59x?xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<2x60x58x?xf32>
+    return %2 : tensor<2x60x58x?xf32>
+  }
diff --git a/mlir/test/IR/enum-attr-invalid.mlir b/mlir/test/IR/enum-attr-invalid.mlir
index 923736f28dad..2f240a56c987 100644
--- a/mlir/test/IR/enum-attr-invalid.mlir
+++ b/mlir/test/IR/enum-attr-invalid.mlir
@@ -28,3 +28,78 @@ func.func @test_parse_invalid_attr() -> () {
   // expected-error@+1 {{failed to parse TestEnumAttr parameter 'value'}}
   test.op_with_enum 1 : index
 }
+
+// -----
+
+func.func @test_non_keyword_prop_enum() -> () {
+  // expected-error@+2 {{expected keyword for a test enum}}
+  // expected-error@+1 {{invalid value for property value, expected a test enum}}
+  test.op_with_enum_prop 0
+  return
+}
+
+// -----
+
+func.func @test_wrong_keyword_prop_enum() -> () {
+  // expected-error@+2 {{expected one of [first, second, third] for a test enum, got: fourth}}
+  // expected-error@+1 {{invalid value for property value, expected a test enum}}
+  test.op_with_enum_prop fourth
+}
+
+// -----
+
+func.func @test_bad_integer() -> () {
+  // expected-error@+1 {{op property 'value' failed to satisfy constraint: a test enum}}
+  "test.op_with_enum_prop"() <{value = 4 : i32}> {} : () -> ()
+}
+
+// -----
+
+func.func @test_bit_enum_prop_not_keyword() -> () {
+  // expected-error@+2 {{expected keyword for a test bit enum}}
+  // expected-error@+1 {{invalid value for property value1, expected a test bit enum}}
+  test.op_with_bit_enum_prop 0
+  return
+}
+
+// -----
+
+func.func @test_bit_enum_prop_wrong_keyword() -> () {
+  // expected-error@+2 {{expected one of [read, write, execute] for a test bit enum, got: chroot}}
+  // expected-error@+1 {{invalid value for property value1, expected a test bit enum}}
+  test.op_with_bit_enum_prop read, chroot : ()
+  return
+}
+
+// -----
+
+func.func @test_bit_enum_prop_bad_value() -> () {
+  // expected-error@+1 {{op property 'value2' failed to satisfy constraint: a test bit enum}}
+  "test.op_with_bit_enum_prop"() <{value1 = 7 : i32, value2 = 8 : i32}> {} : () -> ()
+  return
+}
+
+// -----
+
+func.func @test_bit_enum_prop_named_wrong_keyword() -> () {
+  // expected-error@+2 {{expected 'bit_enum'}}
+  // expected-error@+1 {{invalid value for property value1, expected a test bit enum}}
+  test.op_with_bit_enum_prop_named foo<read, execute>
+  return
+}
+
+// -----
+
+func.func @test_bit_enum_prop_named_not_open() -> () {
+  // expected-error@+2 {{expected '<'}}
+  // expected-error@+1 {{invalid value for property value1, expected a test bit enum}}
+  test.op_with_bit_enum_prop_named bit_enum read, execute>
+}
+
+// -----
+
+func.func @test_bit_enum_prop_named_not_closed() -> () {
+  // expected-error@+2 {{expected '>'}}
+  // expected-error@+1 {{invalid value for property value1, expected a test bit enum}}
+  test.op_with_bit_enum_prop_named bit_enum<read, execute +
+}
diff --git a/mlir/test/IR/enum-attr-roundtrip.mlir b/mlir/test/IR/enum-attr-roundtrip.mlir
index 36e605bdbff4..f1f09f977b7d 100644
--- a/mlir/test/IR/enum-attr-roundtrip.mlir
+++ b/mlir/test/IR/enum-attr-roundtrip.mlir
@@ -35,3 +35,48 @@ func.func @test_match_op_with_bit_enum() -> () {
   test.op_with_bit_enum <execute, write> tag 0 : i32
   return
 }
+
+// CHECK-LABEL: @test_enum_prop
+func.func @test_enum_prop() -> () {
+  // CHECK: test.op_with_enum_prop first
+  test.op_with_enum_prop first
+
+  // CHECK: test.op_with_enum_prop first
+  "test.op_with_enum_prop"() <{value = 0 : i32}> {} : () -> ()
+
+  // CHECK: test.op_with_enum_prop_attr_form <{value = 0 : i32}>
+  test.op_with_enum_prop_attr_form <{value = 0 : i32}>
+  // CHECK: test.op_with_enum_prop_attr_form <{value = 1 : i32}>
+  test.op_with_enum_prop_attr_form <{value = #test<enum second>}>
+
+  // CHECK: test.op_with_enum_prop_attr_form_always <{value = #test<enum first>}>
+  test.op_with_enum_prop_attr_form_always <{value = #test<enum first>}>
+  // CHECK: test.op_with_enum_prop_attr_form_always  <{value = #test<enum second>}
+  test.op_with_enum_prop_attr_form_always <{value = #test<enum second>}>
+
+  return
+}
+
+// CHECK-LABEL @test_bit_enum_prop()
+func.func @test_bit_enum_prop() -> () {
+  // CHECK: test.op_with_bit_enum_prop read : ()
+  test.op_with_bit_enum_prop read read : ()
+
+  // CHECK: test.op_with_bit_enum_prop read, write write, execute
+  test.op_with_bit_enum_prop read, write write, execute : ()
+
+  // CHECK: test.op_with_bit_enum_prop read, execute write
+  "test.op_with_bit_enum_prop"() <{value1 = 5 : i32, value2 = 2 : i32}> {} : () -> ()
+
+  // CHECK: test.op_with_bit_enum_prop read, write, execute
+  test.op_with_bit_enum_prop read, write, execute : ()
+
+  // CHECK: test.op_with_bit_enum_prop_named bit_enum<read>{{$}}
+  test.op_with_bit_enum_prop_named bit_enum<read> bit_enum<read>
+  // CHECK: test.op_with_bit_enum_prop_named bit_enum<read, write> bit_enum<write, execute>
+  test.op_with_bit_enum_prop_named bit_enum<read, write> bit_enum<write, execute>
+  // CHECK: test.op_with_bit_enum_prop_named bit_enum<read, write, execute>
+  test.op_with_bit_enum_prop_named bit_enum<read, write, execute>
+
+  return
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 31be00ace138..85a49e05d4c7 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -424,6 +424,52 @@ def : Pat<(OpWithEnum ConstantEnumCase<TestEnumAttr, "first">:$value,
                       ConstantAttr<I32Attr, "1">)>;
 
 //===----------------------------------------------------------------------===//
+// Test Enum Properties
+//===----------------------------------------------------------------------===//
+
+// Define the enum property.
+def TestEnumProp : EnumProp<TestEnum>;
+// Define an op that contains the enum property.
+def OpWithEnumProp : TEST_Op<"op_with_enum_prop"> {
+  let arguments = (ins TestEnumProp:$value);
+  let assemblyFormat = "$value attr-dict";
+}
+
+def TestEnumPropAttrForm : EnumPropWithAttrForm<TestEnum, TestEnumAttr>;
+def OpWithEnumPropAttrForm : TEST_Op<"op_with_enum_prop_attr_form"> {
+  let arguments = (ins TestEnumPropAttrForm:$value);
+  let assemblyFormat = "prop-dict attr-dict";
+}
+
+def TestEnumPropAttrFormAlways : EnumPropWithAttrForm<TestEnum, TestEnumAttr> {
+  let storeInCustomAttribute = 1;
+}
+def OpWithEnumPropAttrFormAlways : TEST_Op<"op_with_enum_prop_attr_form_always"> {
+  let arguments = (ins TestEnumPropAttrFormAlways:$value);
+  let assemblyFormat = "prop-dict attr-dict";
+}
+
+def TestBitEnumProp : EnumProp<TestBitEnum> {
+  let defaultValue = TestBitEnum.cppType # "::Read";
+}
+def OpWithTestBitEnum : TEST_Op<"op_with_bit_enum_prop"> {
+  let arguments = (ins
+    TestBitEnumProp:$value1,
+    TestBitEnumProp:$value2);
+  let assemblyFormat = "$value1 ($value2^)? attr-dict `:` `(``)`";
+}
+
+def TestBitEnumPropNamed : NamedEnumProp<TestBitEnum, "bit_enum"> {
+  let defaultValue = TestBitEnum.cppType # "::Read";
+}
+def OpWithBitEnumPropNamed : TEST_Op<"op_with_bit_enum_prop_named"> {
+  let arguments = (ins
+    TestBitEnumPropNamed:$value1,
+    TestBitEnumPropNamed:$value2);
+  let assemblyFormat = "$value1 ($value2^)? attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
 // Test Bit Enum Attributes
 //===----------------------------------------------------------------------===//
author	Aiden Grossman <aidengrossman@google.com>	2025-04-14 07:38:01 +0000
committer	Aiden Grossman <aidengrossman@google.com>	2025-04-14 07:38:01 +0000
commit	0b43a0423bbaa22384d522050a295eb564116d95 (patch)
tree	18b4111bcb0563e9f7279666299318617dacc017
parent	b7163d6a2490a7517cb8a526e36a877a4fd7bede (diff)
parent	97bc9137e545423334b00d60ab64855ccc434c3a (diff)